[flang][acc] Lower do and do concurrent loops specially in acc regions (#149614)

When OpenACC is enabled and Fortran loops are annotated with `acc loop`,
they are lowered to the `acc.loop` operation, and the rest of the contained
loops use the normal FIR lowering path.

However, the OpenACC specification has special provisions related to
contained loops and their induction variables. In order to adhere to
this, we convert all valid contained loops to `acc.loop` in order to
store this information appropriately.

The provisions in the spec that motivated this change (line numbers are
from OpenACC 3.4):
- 1353 Loop variables in Fortran do statements within a compute
construct are predetermined to be private to the thread that executes
the loop.
- 3783 When do concurrent appears without a loop construct in a kernels
construct it is treated as if it is annotated with loop auto. If it
appears in a parallel construct or an accelerator routine then it is
treated as if it is annotated with loop independent.

By valid loops we mean do loops and do concurrent loops which have an
induction variable. Loops which are unstructured are not handled.
This commit is contained in:
Razvan Lupusoru 2025-07-29 10:03:22 -07:00 committed by GitHub
parent 2a3f72ee6e
commit 4128cf3b26
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 774 additions and 121 deletions

View File

@ -43,6 +43,7 @@ struct ProcedureDesignator;
namespace parser {
struct AccClauseList;
struct DoConstruct;
struct OpenACCConstruct;
struct OpenACCDeclarativeConstruct;
struct OpenACCRoutineConstruct;
@ -58,6 +59,7 @@ namespace lower {
class AbstractConverter;
class StatementContext;
class SymMap;
namespace pft {
struct Evaluation;
@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &,
void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
mlir::Location);
int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
/// Used to obtain the number of contained loops to look for
/// since this is dependent on number of tile operands and collapse
/// clause.
uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
/// Checks whether the current insertion point is inside OpenACC loop.
bool isInOpenACCLoop(fir::FirOpBuilder &);
/// Checks whether the current insertion point is inside OpenACC compute
/// construct.
bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);
void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location);
/// Generates an OpenACC loop from a do construct in order to
/// properly capture the loop bounds, parallelism determination mode,
/// and to privatize the loop variables.
/// \param converter lowering converter used to generate the operation.
/// \param semanticsContext semantics context forwarded to the loop builder.
/// \param localSymbols symbol map in which a new scope is opened for the
/// loop variables.
/// \param doConstruct the do or do concurrent construct to convert.
/// \param eval the PFT evaluation associated with \p doConstruct.
/// \returns the generated loop operation. When the conversion is rejected,
/// nullptr is returned: this happens for loops that are neither normal do
/// loops nor do concurrent loops, and for loops lowered as unstructured
/// (which are reported as not yet implemented inside `acc kernels`).
mlir::Operation *genOpenACCLoopFromDoConstruct(
AbstractConverter &converter,
Fortran::semantics::SemanticsContext &semanticsContext,
Fortran::lower::SymMap &localSymbols,
const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval);
} // namespace lower
} // namespace Fortran

View File

@ -2167,10 +2167,35 @@ private:
/// - structured and unstructured concurrent loops
void genFIR(const Fortran::parser::DoConstruct &doConstruct) {
setCurrentPositionAt(doConstruct);
// Collect loop nest information.
// Generate begin loop code directly for infinite and while loops.
Fortran::lower::pft::Evaluation &eval = getEval();
bool unstructuredContext = eval.lowerAsUnstructured();
// Loops with induction variables inside OpenACC compute constructs
// need special handling to ensure that the IVs are privatized.
if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
*this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval);
bool success = loopOp != nullptr;
if (success) {
// Sanity check that the builder insertion point is inside the newly
// generated loop.
assert(
loopOp->getRegion(0).isAncestor(
builder->getInsertionPoint()->getBlock()->getParent()) &&
"builder insertion point is not inside the newly generated loop");
// Loop body code.
auto iter = eval.getNestedEvaluations().begin();
for (auto end = --eval.getNestedEvaluations().end(); iter != end;
++iter)
genFIR(*iter, unstructuredContext);
return;
}
// Fall back to normal loop handling.
}
// Collect loop nest information.
// Generate begin loop code directly for infinite and while loops.
Fortran::lower::pft::Evaluation &doStmtEval =
eval.getFirstNestedEvaluation();
auto *doStmt = doStmtEval.getIf<Fortran::parser::NonLabelDoStmt>();
@ -3124,7 +3149,7 @@ private:
Fortran::lower::pft::Evaluation *curEval = &getEval();
if (accLoop || accCombined) {
int64_t loopCount;
uint64_t loopCount;
if (accLoop) {
const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
@ -3142,7 +3167,7 @@ private:
if (curEval->lowerAsStructured()) {
curEval = &curEval->getFirstNestedEvaluation();
for (int64_t i = 1; i < loopCount; i++)
for (uint64_t i = 1; i < loopCount; i++)
curEval = &*std::next(curEval->getNestedEvaluations().begin());
}
}

View File

@ -36,6 +36,7 @@
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Frontend/OpenACC/ACC.h.inc"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@ -2142,6 +2143,168 @@ static void determineDefaultLoopParMode(
}
}
// Gathers everything needed to describe the loop nest rooted at
// `outerDoConstruct`: lower bounds, upper bounds, steps, the privatized
// induction variables, and the source locations that get fused together.
// For DO CONCURRENT every control of the concurrent header is processed; for
// regular do loops, `loopsToProcess` nested loops are processed starting with
// `outerDoConstruct` itself.
static void processDoLoopBounds(
    Fortran::lower::AbstractConverter &converter,
    mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
    fir::FirOpBuilder &builder,
    const Fortran::parser::DoConstruct &outerDoConstruct,
    Fortran::lower::pft::Evaluation &eval,
    llvm::SmallVector<mlir::Value> &lowerbounds,
    llvm::SmallVector<mlir::Value> &upperbounds,
    llvm::SmallVector<mlir::Value> &steps,
    llvm::SmallVector<mlir::Value> &privateOperands,
    llvm::SmallVector<mlir::Value> &ivPrivate,
    llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
    llvm::SmallVector<mlir::Type> &ivTypes,
    llvm::SmallVector<mlir::Location> &ivLocs,
    llvm::SmallVector<bool> &inclusiveBounds,
    llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
  assert(loopsToProcess > 0 && "expect at least one loop");

  // Lowers a parse-tree expression and returns its base value.
  auto lowerExpr = [&](const auto &parserExpr) -> mlir::Value {
    return fir::getBase(converter.genExprValue(
        *Fortran::semantics::GetExpr(parserExpr), stmtCtx));
  };
  // If `step` is not present, assume it is `1`, typed like the most recently
  // recorded upper bound.
  auto unitStep = [&]() -> mlir::Value {
    return builder.createIntegerConstant(currentLocation,
                                         upperbounds.back().getType(), 1);
  };

  locs.push_back(currentLocation); // Location of the directive
  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
  const bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();

  if (isDoConcurrent) {
    locs.push_back(converter.genLocation(
        Fortran::parser::FindSourceLocation(outerDoConstruct)));
    const Fortran::parser::LoopControl *loopControl =
        &*outerDoConstruct.GetLoopControl();
    const auto &concurrent =
        std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
    const auto &localitySpecs =
        std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t);
    if (!localitySpecs.empty())
      TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC");

    const auto &concurrentHeader =
        std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
    const auto &controls =
        std::get<std::list<Fortran::parser::ConcurrentControl>>(
            concurrentHeader.t);
    for (const auto &control : controls) {
      lowerbounds.push_back(lowerExpr(std::get<1>(control.t)));
      upperbounds.push_back(lowerExpr(std::get<2>(control.t)));
      if (const auto &stepExpr =
              std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
                  control.t))
        steps.push_back(lowerExpr(*stepExpr));
      else
        steps.push_back(unitStep());

      const auto &ivName = std::get<Fortran::parser::Name>(control.t);
      privatizeIv(converter, *ivName.symbol, currentLocation, ivTypes, ivLocs,
                  privateOperands, ivPrivate, privatizationRecipes,
                  isDoConcurrent);
      inclusiveBounds.push_back(true);
    }
    return;
  }

  // Regular do loops: walk down the nest one level per iteration.
  for (uint64_t depth = 0; depth < loopsToProcess; ++depth) {
    // The outermost level is the construct handed in by the caller; deeper
    // levels are read from the evaluation currently pointed at.
    const Fortran::parser::DoConstruct *currentDo = &outerDoConstruct;
    if (depth != 0) {
      currentDo = crtEval->getIf<Fortran::parser::DoConstruct>();
      assert(currentDo && "expect do construct");
    }
    const Fortran::parser::LoopControl *loopControl =
        &*currentDo->GetLoopControl();
    locs.push_back(converter.genLocation(
        Fortran::parser::FindSourceLocation(*currentDo)));

    const Fortran::parser::LoopControl::Bounds *bounds =
        std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
    assert(bounds && "Expected bounds on the loop construct");
    lowerbounds.push_back(lowerExpr(bounds->lower));
    upperbounds.push_back(lowerExpr(bounds->upper));
    if (bounds->step)
      steps.push_back(lowerExpr(bounds->step));
    else
      steps.push_back(unitStep());

    Fortran::semantics::Symbol &ivSym =
        bounds->name.thing.symbol->GetUltimate();
    privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
                privateOperands, ivPrivate, privatizationRecipes);
    inclusiveBounds.push_back(true);

    // Step into the next nested loop unless this was the last one requested.
    if (depth + 1 < loopsToProcess)
      crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
  }
}
// Builds an acc.loop operation for the loop nest rooted at `outerDoConstruct`.
// Bounds, steps and induction variable privatization are derived from the do
// construct(s); the remaining operand groups are supplied by the caller.
// After the operation is created, each region argument is stored into the
// matching privatized induction variable and all upper bounds are marked
// inclusive.
// `semanticsContext` is accepted for interface parity but not used here.
static mlir::acc::LoopOp
buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
               mlir::Location currentLocation,
               Fortran::semantics::SemanticsContext &semanticsContext,
               Fortran::lower::StatementContext &stmtCtx,
               const Fortran::parser::DoConstruct &outerDoConstruct,
               Fortran::lower::pft::Evaluation &eval,
               llvm::SmallVector<mlir::Value> &privateOperands,
               llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
               llvm::SmallVector<mlir::Value> &gangOperands,
               llvm::SmallVector<mlir::Value> &workerNumOperands,
               llvm::SmallVector<mlir::Value> &vectorOperands,
               llvm::SmallVector<mlir::Value> &tileOperands,
               llvm::SmallVector<mlir::Value> &cacheOperands,
               llvm::SmallVector<mlir::Value> &reductionOperands,
               llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
               uint64_t loopsToProcess) {
  fir::FirOpBuilder &builder = converter.getFirOpBuilder();

  // Loop-nest description filled in from the do/do concurrent constructs.
  llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
  llvm::SmallVector<mlir::Value> ivPrivate;
  llvm::SmallVector<mlir::Type> ivTypes;
  llvm::SmallVector<mlir::Location> ivLocs;
  llvm::SmallVector<bool> inclusiveBounds;
  llvm::SmallVector<mlir::Location> locs;
  processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
                      outerDoConstruct, eval, lowerbounds, upperbounds, steps,
                      privateOperands, ivPrivate, privatizationRecipes, ivTypes,
                      ivLocs, inclusiveBounds, locs, loopsToProcess);

  // Assemble the flat operand list and its segment sizes. The order of the
  // groups below defines the operand segment layout and must not change.
  llvm::SmallVector<mlir::Value> operands;
  llvm::SmallVector<int32_t> operandSegments;
  for (llvm::SmallVector<mlir::Value> *group :
       {&lowerbounds, &upperbounds, &steps, &gangOperands, &workerNumOperands,
        &vectorOperands, &tileOperands, &cacheOperands, &privateOperands,
        &reductionOperands})
    addOperands(operands, operandSegments, *group);

  auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
      builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
      operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
      ivLocs);

  // Store each loop region argument into its privatized induction variable.
  auto regionArgs = loopOp.getLoopRegions().front()->front().getArguments();
  for (auto [blockArg, privateIv] : llvm::zip(regionArgs, ivPrivate))
    fir::StoreOp::create(builder, currentLocation, blockArg, privateIv);

  loopOp.setInclusiveUpperbound(inclusiveBounds);
  return loopOp;
}
static mlir::acc::LoopOp createLoopOp(
Fortran::lower::AbstractConverter &converter,
mlir::Location currentLocation,
@ -2154,9 +2317,9 @@ static mlir::acc::LoopOp createLoopOp(
std::nullopt,
bool needEarlyReturnHandling = false) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
llvm::SmallVector<mlir::Value> tileOperands, privateOperands,
reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
gangOperands, lowerbounds, upperbounds, steps;
gangOperands;
llvm::SmallVector<mlir::Attribute> privatizationRecipes, reductionRecipes;
llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
llvm::SmallVector<int64_t> collapseValues;
@ -2325,107 +2488,6 @@ static mlir::acc::LoopOp createLoopOp(
}
}
llvm::SmallVector<mlir::Type> ivTypes;
llvm::SmallVector<mlir::Location> ivLocs;
llvm::SmallVector<bool> inclusiveBounds;
llvm::SmallVector<mlir::Location> locs;
locs.push_back(currentLocation); // Location of the directive
Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
if (isDoConcurrent) {
locs.push_back(converter.genLocation(
Fortran::parser::FindSourceLocation(outerDoConstruct)));
const Fortran::parser::LoopControl *loopControl =
&*outerDoConstruct.GetLoopControl();
const auto &concurrent =
std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
.empty())
TODO(currentLocation, "DO CONCURRENT with locality spec");
const auto &concurrentHeader =
std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
const auto &controls =
std::get<std::list<Fortran::parser::ConcurrentControl>>(
concurrentHeader.t);
for (const auto &control : controls) {
lowerbounds.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
upperbounds.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
if (const auto &expr =
std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
control.t))
steps.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(*expr), stmtCtx)));
else // If `step` is not present, assume it is `1`.
steps.push_back(builder.createIntegerConstant(
currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
const auto &name = std::get<Fortran::parser::Name>(control.t);
privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
privateOperands, ivPrivate, privatizationRecipes,
isDoConcurrent);
inclusiveBounds.push_back(true);
}
} else {
int64_t loopCount =
Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
for (unsigned i = 0; i < loopCount; ++i) {
const Fortran::parser::LoopControl *loopControl;
if (i == 0) {
loopControl = &*outerDoConstruct.GetLoopControl();
locs.push_back(converter.genLocation(
Fortran::parser::FindSourceLocation(outerDoConstruct)));
} else {
auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
assert(doCons && "expect do construct");
loopControl = &*doCons->GetLoopControl();
locs.push_back(converter.genLocation(
Fortran::parser::FindSourceLocation(*doCons)));
}
const Fortran::parser::LoopControl::Bounds *bounds =
std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
assert(bounds && "Expected bounds on the loop construct");
lowerbounds.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
upperbounds.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
if (bounds->step)
steps.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
else // If `step` is not present, assume it is `1`.
steps.push_back(builder.createIntegerConstant(
currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
Fortran::semantics::Symbol &ivSym =
bounds->name.thing.symbol->GetUltimate();
privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
privateOperands, ivPrivate, privatizationRecipes);
inclusiveBounds.push_back(true);
if (i < loopCount - 1)
crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
}
}
// Prepare the operand segment size attribute and the operands value range.
llvm::SmallVector<mlir::Value> operands;
llvm::SmallVector<int32_t> operandSegments;
addOperands(operands, operandSegments, lowerbounds);
addOperands(operands, operandSegments, upperbounds);
addOperands(operands, operandSegments, steps);
addOperands(operands, operandSegments, gangOperands);
addOperands(operands, operandSegments, workerNumOperands);
addOperands(operands, operandSegments, vectorOperands);
addOperands(operands, operandSegments, tileOperands);
addOperands(operands, operandSegments, cacheOperands);
addOperands(operands, operandSegments, privateOperands);
addOperands(operands, operandSegments, reductionOperands);
llvm::SmallVector<mlir::Type> retTy;
mlir::Value yieldValue;
if (needEarlyReturnHandling) {
@ -2434,16 +2496,13 @@ static mlir::acc::LoopOp createLoopOp(
retTy.push_back(i1Ty);
}
auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
ivLocs);
for (auto [arg, value] : llvm::zip(
loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
fir::StoreOp::create(builder, currentLocation, arg, value);
loopOp.setInclusiveUpperbound(inclusiveBounds);
uint64_t loopsToProcess =
Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
auto loopOp = buildACCLoopOp(
converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct,
eval, privateOperands, privatizationRecipes, gangOperands,
workerNumOperands, vectorOperands, tileOperands, cacheOperands,
reductionOperands, retTy, yieldValue, loopsToProcess);
if (!gangDeviceTypes.empty())
loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes));
@ -4899,6 +4958,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
return false;
}
/// Returns true when the region holding the builder's current insertion point
/// is enclosed by an OpenACC compute construct operation.
bool Fortran::lower::isInsideOpenACCComputeConstruct(
    fir::FirOpBuilder &builder) {
  mlir::Operation *enclosingOp =
      mlir::acc::getEnclosingComputeOp(builder.getRegion());
  return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(enclosingOp);
}
void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside(
fir::FirOpBuilder &builder) {
if (auto loopOp =
@ -4913,10 +4978,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder,
mlir::acc::YieldOp::create(builder, loc, yieldValue);
}
int64_t Fortran::lower::getLoopCountForCollapseAndTile(
uint64_t Fortran::lower::getLoopCountForCollapseAndTile(
const Fortran::parser::AccClauseList &clauseList) {
int64_t collapseLoopCount = 1;
int64_t tileLoopCount = 1;
uint64_t collapseLoopCount = 1;
uint64_t tileLoopCount = 1;
for (const Fortran::parser::AccClause &clause : clauseList.v) {
if (const auto *collapseClause =
std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
@ -4935,3 +5000,101 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile(
return tileLoopCount;
return collapseLoopCount;
}
/// Creates an ACC loop operation for a DO construct that appears inside an
/// ACC compute construct without its own `acc loop` directive. This serves as
/// a bridge between regular DO construct handling and ACC loop creation.
/// Returns nullptr when the loop is not converted.
mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
    AbstractConverter &converter,
    Fortran::semantics::SemanticsContext &semanticsContext,
    Fortran::lower::SymMap &localSymbols,
    const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) {
  // Only loops with induction variables (normal do and do concurrent) need
  // privatization, so everything else is left to the regular lowering.
  const bool hasInductionVariable =
      doConstruct.IsDoNormal() || doConstruct.IsDoConcurrent();
  if (!hasInductionVariable)
    return nullptr;

  fir::FirOpBuilder &builder = converter.getFirOpBuilder();

  // acc.loop has no unstructured form, so unstructured evaluations cannot be
  // converted.
  // TODO: Other strategies may be possible, such as generating acc.private
  // for the loop variables without attaching them to acc.loop.
  // For now a not-yet-implemented message is emitted, because without
  // privatizing the induction variable the loop may not execute correctly.
  // This is limited to `acc kernels`: in `acc parallel`, scalars end up as
  // implicitly firstprivate.
  if (eval.lowerAsUnstructured()) {
    mlir::Operation *enclosingOp =
        mlir::acc::getEnclosingComputeOp(builder.getRegion());
    if (mlir::isa_and_present<mlir::acc::KernelsOp>(enclosingOp))
      TODO(converter.getCurrentLocation(),
           "unstructured do loop in acc kernels");
    return nullptr;
  }

  // Open a new symbol scope for the loop variables; it is closed again when
  // this function returns.
  // NOTE(review): the caller lowers the loop body after this function has
  // returned, i.e. after the scope is popped - confirm that any bindings
  // made while privatizing the induction variable are still visible there.
  localSymbols.pushScope();
  auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); });

  // The Fortran do loops handled here carry no `acc loop` clauses, so all
  // clause operand groups start out empty.
  llvm::SmallVector<mlir::Value> privateOperands, gangOperands,
      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
      reductionOperands;
  llvm::SmallVector<mlir::Attribute> privatizationRecipes;
  llvm::SmallVector<mlir::Type> retTy;
  mlir::Value yieldValue;
  const uint64_t loopsToProcess = 1; // Single loop construct

  // Reuse the mechanism that handles do loops contained in `acc loop` for the
  // implicit loop case.
  Fortran::lower::StatementContext stmtCtx;
  auto loopOp = buildACCLoopOp(
      converter, converter.getCurrentLocation(), semanticsContext, stmtCtx,
      doConstruct, eval, privateOperands, privatizationRecipes, gangOperands,
      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
      reductionOperands, retTy, yieldValue, loopsToProcess);

  if (!privatizationRecipes.empty())
    loopOp.setPrivatizationRecipesAttr(
        mlir::ArrayAttr::get(builder.getContext(), privatizationRecipes));

  // Parallelism determination mode:
  // - Normal do loops without `acc loop` are left for analysis (`auto`), even
  //   in `acc parallel`, because applying `independent` by default is only
  //   for loops marked with `acc loop`.
  // - For do concurrent, section 2.17.2 of the spec says:
  //   "When do concurrent appears without a loop construct in a kernels
  //   construct it is treated as if it is annotated with loop auto. If it
  //   appears in a parallel construct or an accelerator routine then it is
  //   treated as if it is annotated with loop independent."
  // So the mode is `auto` unless the loop is a `do concurrent` in an
  // `acc parallel` construct (`independent`) or is located in an `acc serial`
  // construct (`seq`).
  mlir::Operation *accRegionOp =
      mlir::acc::getEnclosingComputeOp(builder.getRegion());
  mlir::acc::LoopParMode parMode = mlir::acc::LoopParMode::loop_auto;
  if (mlir::isa_and_present<mlir::acc::ParallelOp>(accRegionOp) &&
      doConstruct.IsDoConcurrent())
    parMode = mlir::acc::LoopParMode::loop_independent;
  else if (mlir::isa_and_present<mlir::acc::SerialOp>(accRegionOp))
    parMode = mlir::acc::LoopParMode::loop_seq;

  // Record the mode on the loop for the default (none) device type.
  auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
      builder.getContext(), mlir::acc::DeviceType::None);
  auto arrOfDeviceNone =
      mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr);
  switch (parMode) {
  case mlir::acc::LoopParMode::loop_independent:
    loopOp.setIndependentAttr(arrOfDeviceNone);
    return loopOp;
  case mlir::acc::LoopParMode::loop_seq:
    loopOp.setSeqAttr(arrOfDeviceNone);
    return loopOp;
  case mlir::acc::LoopParMode::loop_auto:
    loopOp.setAuto_Attr(arrOfDeviceNone);
    return loopOp;
  }
  llvm_unreachable("Unexpected loop par mode");
}

View File

@ -0,0 +1,91 @@
! RUN: split-file %s %t
! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_stop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK1
! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_cycle_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_goto_loop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK3
! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_loop_with_inner_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK4
//--- do_loop_with_stop.f90
subroutine do_loop_with_stop()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Do loop containing a stop statement inside acc kernels. The expected
! diagnostic below shows that it is treated as an unstructured do loop and
! reported as not yet implemented.
!$acc kernels
do i = 1, n
a(i) = b(i) + 1.0
if (i == 5) stop
end do
!$acc end kernels
! CHECK1: not yet implemented: unstructured do loop in acc kernels
end subroutine
//--- do_loop_with_cycle_goto.f90
subroutine do_loop_with_cycle_goto()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Do loop with cycle and goto - unstructured control flow is not converted.
! Inside acc kernels it is expected to be reported as not yet implemented.
!$acc kernels
do i = 1, n
if (i == 3) cycle
a(i) = b(i) + 1.0
if (i == 7) goto 200
a(i) = a(i) * 2.0
end do
200 continue
!$acc end kernels
! CHECK2: not yet implemented: unstructured do loop in acc kernels
end subroutine
//--- nested_goto_loop.f90
subroutine nested_goto_loop()
integer :: i, j
integer, parameter :: n = 10, m = 5
real, dimension(n,m) :: a, b
! Nested loop with goto from inner to outer - unstructured control flow is
! not converted to acc.loop. Inside acc kernels it is expected to be reported
! as not yet implemented.
!$acc kernels
do i = 1, n
do j = 1, m
a(i,j) = b(i,j) + 1.0
if (i * j > 20) goto 300 ! Exit both loops
end do
end do
300 continue
!$acc end kernels
! CHECK3: not yet implemented: unstructured do loop in acc kernels
end subroutine
//--- nested_loop_with_inner_goto.f90
subroutine nested_loop_with_inner_goto()
integer :: ii = 0, jj = 0
integer, parameter :: nn = 3
real, dimension(nn, nn) :: aa
aa = -1
! Nested loop with goto from inner loop - unstructured control flow is not converted.
! The goto target (label 300) is still inside the outer loop. Inside acc
! kernels this is expected to be reported as not yet implemented.
!$acc kernels
do ii = 1, nn
do jj = 1, nn
if (jj > 1) goto 300
aa(jj, ii) = 1337
end do
300 continue
end do
!$acc end kernels
! CHECK4: not yet implemented: unstructured do loop in acc kernels
end subroutine

View File

@ -0,0 +1,267 @@
! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs.
! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling.
! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: func.func @_QPbasic_do_loop
subroutine basic_do_loop()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do loop that should be converted to acc.loop
!$acc kernels
do i = 1, n
a(i) = b(i) + 1.0
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
end subroutine
! CHECK-LABEL: func.func @_QPbasic_do_concurrent
subroutine basic_do_concurrent()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do concurrent loop
!$acc kernels
do concurrent (i = 1:n)
a(i) = b(i) + 1.0
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
end subroutine
! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel
subroutine basic_do_loop_parallel()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do loop with acc parallel that should be converted to acc.loop
!$acc parallel
do i = 1, n
a(i) = b(i) + 1.0
end do
!$acc end parallel
! CHECK: acc.parallel {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
end subroutine
! CHECK-LABEL: func.func @_QPbasic_do_loop_serial
subroutine basic_do_loop_serial()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do loop with acc serial that should be converted to acc.loop
!$acc serial
do i = 1, n
a(i) = b(i) + 1.0
end do
!$acc end serial
! CHECK: acc.serial {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
end subroutine
! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel
subroutine basic_do_concurrent_parallel()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do concurrent loop with acc parallel
!$acc parallel
do concurrent (i = 1:n)
a(i) = b(i) + 1.0
end do
!$acc end parallel
! CHECK: acc.parallel {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
end subroutine
! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial
subroutine basic_do_concurrent_serial()
integer :: i
integer, parameter :: n = 10
real, dimension(n) :: a, b
! Basic do concurrent loop with acc serial
!$acc serial
do concurrent (i = 1:n)
a(i) = b(i) + 1.0
end do
!$acc end serial
! CHECK: acc.serial {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
end subroutine
! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent
subroutine multi_dimension_do_concurrent()
integer :: i, j, k
integer, parameter :: n = 10, m = 20, l = 5
real, dimension(n,m,l) :: a, b
! Multi-dimensional do concurrent with multiple iteration variables
!$acc kernels
do concurrent (i = 1:n, j = 1:m, k = 1:l)
a(i,j,k) = b(i,j,k) * 2.0
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
end subroutine
! CHECK-LABEL: func.func @_QPnested_do_loops
subroutine nested_do_loops()
integer :: i, j
integer, parameter :: n = 10, m = 20
real, dimension(n,m) :: a, b
! Nested do loops
!$acc kernels
do i = 1, n
do j = 1, m
a(i,j) = b(i,j) + i + j
end do
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
end subroutine
! CHECK-LABEL: func.func @_QPvariable_bounds_and_step
subroutine variable_bounds_and_step(n, start_val, step_val)
integer, intent(in) :: n, start_val, step_val
integer :: i
real, dimension(n) :: a, b
! Do loop with variable bounds and step
!$acc kernels
do i = start_val, n, step_val
a(i) = b(i) * 2.0
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.yield
! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
end subroutine
! CHECK-LABEL: func.func @_QPdifferent_iv_types
subroutine different_iv_types()
integer(kind=8) :: i8
integer(kind=4) :: i4
integer(kind=2) :: i2
integer, parameter :: n = 10
real, dimension(n) :: a, b, c, d
! Test different iteration variable types
!$acc kernels
do i8 = 1_8, int(n,8)
a(i8) = b(i8) + 1.0
end do
!$acc end kernels
!$acc kernels
do i4 = 1, n
b(i4) = c(i4) + 1.0
end do
!$acc end kernels
!$acc kernels
do i2 = 1_2, int(n,2)
c(i2) = d(i2) + 1.0
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
! CHECK: acc.kernels {
! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
end subroutine
! -----------------------------------------------------------------------------------------
! Tests for loops that should NOT be converted to acc.loop because they have no induction variable (infinite and DO WHILE loops)
! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv
subroutine infinite_loop_no_iv()
integer :: i
logical :: condition
! Infinite loop with no induction variable - should NOT convert to acc.loop
!$acc kernels
do
i = i + 1
if (i > 100) exit
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK-NOT: acc.loop
end subroutine
! CHECK-LABEL: func.func @_QPwhile_like_loop
subroutine while_like_loop()
integer :: i
logical :: condition
i = 1
condition = .true.
! While-like infinite loop - should NOT convert to acc.loop
!$acc kernels
do while (condition)
i = i + 1
if (i > 100) condition = .false.
end do
!$acc end kernels
! CHECK: acc.kernels {
! CHECK-NOT: acc.loop
end subroutine

View File

@ -134,6 +134,24 @@ def OpenACC_VariableTypeCategory : I32BitEnumAttr<
let printBitEnumPrimaryGroups = 1;
}
// These are the parallelism determination modes for `acc loop`.
// The enum case names carry a "loop_" prefix because "auto" is a
// language keyword; the other cases use the same prefix for consistency.
// Each case has an explicit numeric value: seq = 0, auto = 1, independent = 2.
def OpenACC_LoopSeq : I32EnumAttrCase<"loop_seq", 0>;
def OpenACC_LoopAuto : I32EnumAttrCase<"loop_auto", 1>;
def OpenACC_LoopIndependent : I32EnumAttrCase<"loop_independent", 2>;
// Enum grouping the three cases above. The order of the case list below
// differs from the numeric order; the values come from the case definitions.
def OpenACC_LoopParMode : I32EnumAttr<
"LoopParMode",
"Encodes the options for loop parallelism determination mode",
[
OpenACC_LoopAuto, OpenACC_LoopIndependent,
OpenACC_LoopSeq]> {
// The generated C++ enum is placed in the mlir::acc namespace.
let cppNamespace = "::mlir::acc";
// NOTE(review): presumably this suppresses generation of a dedicated
// attribute class so that only the C++ enum is emitted - confirm against
// the MLIR enum attribute documentation.
let genSpecializedAttr = 0;
}
// Type constraint used by the operations below: accepts any integer type
// or the `index` type.
def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>;
@ -2373,6 +2391,11 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
// Return whether this LoopOp has a gang, worker, or vector applying to the
// 'default'/None device-type.
bool hasDefaultGangWorkerVector();
// Return the parallelism determination mode (seq, auto, or independent)
// that applies to the requested device type. A mode set specifically for
// that device type takes precedence; if there is none, the mode set
// without a device type is returned instead.
LoopParMode getDefaultOrDeviceTypeParallelism(DeviceType);
}];
let hasCustomAssemblyFormat = 1;
@ -2404,6 +2427,53 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
}];
let hasVerifier = 1;
let builders = [
// Convenience builder: creates an acc.loop from loop bounds and steps plus a
// single parallelism determination mode. The mode is recorded for the `None`
// device type in exactly one of the seq / independent / auto_ attributes;
// every other operand list is empty and every other attribute is null.
OpBuilder<(ins "::mlir::ValueRange":$lowerbounds,
"::mlir::ValueRange":$upperbounds,
"::mlir::ValueRange":$steps,
"LoopParMode":$parMode), [{
// Array attribute built from the `None` device-type attribute; it is the
// value stored in whichever mode attribute matches `parMode`.
auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
$_builder.getContext(), mlir::acc::DeviceType::None);
auto arrOfDeviceNone = mlir::ArrayAttr::get(
$_builder.getContext(), deviceNoneAttr);
// Forward to the fully general build method. Arguments are positional, so
// each one is labelled with the operand or attribute it fills.
build($_builder, $_state,
/*results=*/{},
/*lowerbound=*/lowerbounds,
/*upperbound=*/upperbounds,
/*step=*/steps,
/*inclusiveUpperbound=*/nullptr,
/*collapse=*/nullptr,
/*collapseDeviceType=*/nullptr,
/*gangOperands=*/{},
/*gangOperandsArgType=*/nullptr,
/*gangOperandsSegments=*/nullptr,
/*gangOperandsDeviceType=*/nullptr,
/*workerNumOperands=*/{},
/*workerNumOperandsDeviceType=*/nullptr,
/*vectorOperands=*/{},
/*vectorOperandsDeviceType=*/nullptr,
// Only the attribute matching `parMode` is set; the other two stay null.
/*seq=*/parMode == LoopParMode::loop_seq ?
arrOfDeviceNone : nullptr,
/*independent=*/parMode == LoopParMode::loop_independent ?
arrOfDeviceNone : nullptr,
/*auto_=*/parMode == LoopParMode::loop_auto ?
arrOfDeviceNone : nullptr,
/*gang=*/nullptr,
/*worker=*/nullptr,
/*vector=*/nullptr,
/*tileOperands=*/{},
/*tileOperandsSegments=*/nullptr,
/*tileOperandsDeviceType=*/nullptr,
/*cacheOperands=*/{},
/*privateOperands=*/{},
/*privatizationRecipes=*/nullptr,
/*reductionOperands=*/{},
/*reductionRecipes=*/nullptr,
/*combined=*/nullptr);
}]
>
];
}
// Yield operation for the acc.loop and acc.parallel operations.

View File

@ -2957,6 +2957,23 @@ bool acc::LoopOp::hasDefaultGangWorkerVector() {
getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static);
}
/// Resolves the parallelism determination mode of this loop for `deviceType`.
///
/// A mode attached to `deviceType` itself takes priority and is probed in the
/// order seq, auto, independent. When none is attached, the mode recorded
/// without a device type is used instead, probed in the same order.
acc::LoopParMode
acc::LoopOp::getDefaultOrDeviceTypeParallelism(DeviceType deviceType) {
  using Mode = LoopParMode;

  // Setting specific to the requested device type, if any.
  if (hasSeq(deviceType)) {
    return Mode::loop_seq;
  }
  if (hasAuto(deviceType)) {
    return Mode::loop_auto;
  }
  if (hasIndependent(deviceType)) {
    return Mode::loop_independent;
  }

  // Nothing device-specific: fall back to the default setting.
  if (hasSeq()) {
    return Mode::loop_seq;
  }
  if (!hasAuto()) {
    // Neither seq nor auto, so the default is expected to be independent.
    assert(hasIndependent() &&
           "loop must have default auto, seq, or independent");
    return Mode::loop_independent;
  }
  return Mode::loop_auto;
}
void acc::LoopOp::addGangOperands(
MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {