[flang][acc] Lower do and do concurrent loops specially in acc regions (#149614)

When OpenACC is enabled and Fortran loops are annotated with `acc loop`, they are lowered to `acc.loop` operation. And rest of the contained loops use the normal FIR lowering path. Hovever, the OpenACC specification has special provisions related to contained loops and their induction variable. In order to adhere to this, we convert all valid contained loops to `acc.loop` in order to store this information appropriately. The provisions in the spec that motivated this change (line numbers are from OpenACC 3.4): - 1353 Loop variables in Fortran do statements within a compute construct are predetermined to be private to the thread that executes the loop. - 3783 When do concurrent appears without a loop construct in a kernels construct it is treated as if it is annotated with loop auto. If it appears in a parallel construct or an accelerator routine then it is treated as if it is annotated with loop independent. By valid loops - we convert do loops and do concurrent loops which have induction variable. Loops which are unstructured are not handled.
2025-07-29 10:03:22 -07:00 · 2025-07-29 10:03:22 -07:00 · 4128cf3b26
commit 4128cf3b26
parent 2a3f72ee6e
7 changed files with 774 additions and 121 deletions
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@ -43,6 +43,7 @@ struct ProcedureDesignator;

 namespace parser {
 struct AccClauseList;
+struct DoConstruct;
 struct OpenACCConstruct;
 struct OpenACCDeclarativeConstruct;
 struct OpenACCRoutineConstruct;
@ -58,6 +59,7 @@ namespace lower {

 class AbstractConverter;
 class StatementContext;
+class SymMap;

 namespace pft {
 struct Evaluation;
@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &,
 void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
                          mlir::Location);

-int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
+/// Used to obtain the number of contained loops to look for
+/// since this is dependent on number of tile operands and collapse
+/// clause.
+uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);

+/// Checks whether the current insertion point is inside OpenACC loop.
 bool isInOpenACCLoop(fir::FirOpBuilder &);

+/// Checks whether the current insertion point is inside OpenACC compute
+/// construct.
+bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
+
 void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);

 void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location);

+/// Generates an OpenACC loop from a do construct in order to
+/// properly capture the loop bounds, parallelism determination mode,
+/// and to privatize the loop variables.
+/// When the conversion is rejected, nullptr is returned.
+mlir::Operation *genOpenACCLoopFromDoConstruct(
+    AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semanticsContext,
+    Fortran::lower::SymMap &localSymbols,
+    const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval);
+
 } // namespace lower
 } // namespace Fortran

--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@ -2167,10 +2167,35 @@ private:
  ///  - structured and unstructured concurrent loops
  void genFIR(const Fortran::parser::DoConstruct &doConstruct) {
    setCurrentPositionAt(doConstruct);
-    // Collect loop nest information.
-    // Generate begin loop code directly for infinite and while loops.
    Fortran::lower::pft::Evaluation &eval = getEval();
    bool unstructuredContext = eval.lowerAsUnstructured();
+
+    // Loops with induction variables inside OpenACC compute constructs
+    // need special handling to ensure that the IVs are privatized.
+    if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
+      mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
+          *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval);
+      bool success = loopOp != nullptr;
+      if (success) {
+        // Sanity check that the builder insertion point is inside the newly
+        // generated loop.
+        assert(
+            loopOp->getRegion(0).isAncestor(
+                builder->getInsertionPoint()->getBlock()->getParent()) &&
+            "builder insertion point is not inside the newly generated loop");
+
+        // Loop body code.
+        auto iter = eval.getNestedEvaluations().begin();
+        for (auto end = --eval.getNestedEvaluations().end(); iter != end;
+             ++iter)
+          genFIR(*iter, unstructuredContext);
+        return;
+      }
+      // Fall back to normal loop handling.
+    }
+
+    // Collect loop nest information.
+    // Generate begin loop code directly for infinite and while loops.
    Fortran::lower::pft::Evaluation &doStmtEval =
        eval.getFirstNestedEvaluation();
    auto *doStmt = doStmtEval.getIf<Fortran::parser::NonLabelDoStmt>();
@ -3124,7 +3149,7 @@ private:
    Fortran::lower::pft::Evaluation *curEval = &getEval();

    if (accLoop || accCombined) {
-      int64_t loopCount;
+      uint64_t loopCount;
      if (accLoop) {
        const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
            std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
@ -3142,7 +3167,7 @@ private:

      if (curEval->lowerAsStructured()) {
        curEval = &curEval->getFirstNestedEvaluation();
-        for (int64_t i = 1; i < loopCount; i++)
+        for (uint64_t i = 1; i < loopCount; i++)
          curEval = &*std::next(curEval->getNestedEvaluations().begin());
      }
    }
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@ -36,6 +36,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Frontend/OpenACC/ACC.h.inc"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@ -2142,6 +2143,168 @@ static void determineDefaultLoopParMode(
  }
 }

+// Extract loop bounds, steps, induction variables, and privatization info
+// for both DO CONCURRENT and regular do loops
+static void processDoLoopBounds(
+    Fortran::lower::AbstractConverter &converter,
+    mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
+    fir::FirOpBuilder &builder,
+    const Fortran::parser::DoConstruct &outerDoConstruct,
+    Fortran::lower::pft::Evaluation &eval,
+    llvm::SmallVector<mlir::Value> &lowerbounds,
+    llvm::SmallVector<mlir::Value> &upperbounds,
+    llvm::SmallVector<mlir::Value> &steps,
+    llvm::SmallVector<mlir::Value> &privateOperands,
+    llvm::SmallVector<mlir::Value> &ivPrivate,
+    llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+    llvm::SmallVector<mlir::Type> &ivTypes,
+    llvm::SmallVector<mlir::Location> &ivLocs,
+    llvm::SmallVector<bool> &inclusiveBounds,
+    llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
+  assert(loopsToProcess > 0 && "expect at least one loop");
+  locs.push_back(currentLocation); // Location of the directive
+  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
+  bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
+
+  if (isDoConcurrent) {
+    locs.push_back(converter.genLocation(
+        Fortran::parser::FindSourceLocation(outerDoConstruct)));
+    const Fortran::parser::LoopControl *loopControl =
+        &*outerDoConstruct.GetLoopControl();
+    const auto &concurrent =
+        std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
+    if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
+             .empty())
+      TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC");
+
+    const auto &concurrentHeader =
+        std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
+    const auto &controls =
+        std::get<std::list<Fortran::parser::ConcurrentControl>>(
+            concurrentHeader.t);
+    for (const auto &control : controls) {
+      lowerbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
+      upperbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
+      if (const auto &expr =
+              std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
+                  control.t))
+        steps.push_back(fir::getBase(converter.genExprValue(
+            *Fortran::semantics::GetExpr(*expr), stmtCtx)));
+      else // If `step` is not present, assume it is `1`.
+        steps.push_back(builder.createIntegerConstant(
+            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+      const auto &name = std::get<Fortran::parser::Name>(control.t);
+      privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
+                  privateOperands, ivPrivate, privatizationRecipes,
+                  isDoConcurrent);
+
+      inclusiveBounds.push_back(true);
+    }
+  } else {
+    for (uint64_t i = 0; i < loopsToProcess; ++i) {
+      const Fortran::parser::LoopControl *loopControl;
+      if (i == 0) {
+        loopControl = &*outerDoConstruct.GetLoopControl();
+        locs.push_back(converter.genLocation(
+            Fortran::parser::FindSourceLocation(outerDoConstruct)));
+      } else {
+        auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
+        assert(doCons && "expect do construct");
+        loopControl = &*doCons->GetLoopControl();
+        locs.push_back(converter.genLocation(
+            Fortran::parser::FindSourceLocation(*doCons)));
+      }
+
+      const Fortran::parser::LoopControl::Bounds *bounds =
+          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
+      assert(bounds && "Expected bounds on the loop construct");
+      lowerbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
+      upperbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
+      if (bounds->step)
+        steps.push_back(fir::getBase(converter.genExprValue(
+            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
+      else // If `step` is not present, assume it is `1`.
+        steps.push_back(builder.createIntegerConstant(
+            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+      Fortran::semantics::Symbol &ivSym =
+          bounds->name.thing.symbol->GetUltimate();
+      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+                  privateOperands, ivPrivate, privatizationRecipes);
+
+      inclusiveBounds.push_back(true);
+
+      if (i < loopsToProcess - 1)
+        crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
+    }
+  }
+}
+
+static mlir::acc::LoopOp
+buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
+               mlir::Location currentLocation,
+               Fortran::semantics::SemanticsContext &semanticsContext,
+               Fortran::lower::StatementContext &stmtCtx,
+               const Fortran::parser::DoConstruct &outerDoConstruct,
+               Fortran::lower::pft::Evaluation &eval,
+               llvm::SmallVector<mlir::Value> &privateOperands,
+               llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+               llvm::SmallVector<mlir::Value> &gangOperands,
+               llvm::SmallVector<mlir::Value> &workerNumOperands,
+               llvm::SmallVector<mlir::Value> &vectorOperands,
+               llvm::SmallVector<mlir::Value> &tileOperands,
+               llvm::SmallVector<mlir::Value> &cacheOperands,
+               llvm::SmallVector<mlir::Value> &reductionOperands,
+               llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
+               uint64_t loopsToProcess) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  llvm::SmallVector<mlir::Value> ivPrivate;
+  llvm::SmallVector<mlir::Type> ivTypes;
+  llvm::SmallVector<mlir::Location> ivLocs;
+  llvm::SmallVector<bool> inclusiveBounds;
+  llvm::SmallVector<mlir::Location> locs;
+  llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
+
+  // Look at the do/do concurrent loops to extract bounds information.
+  processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
+                      outerDoConstruct, eval, lowerbounds, upperbounds, steps,
+                      privateOperands, ivPrivate, privatizationRecipes, ivTypes,
+                      ivLocs, inclusiveBounds, locs, loopsToProcess);
+
+  // Prepare the operand segment size attribute and the operands value range.
+  llvm::SmallVector<mlir::Value> operands;
+  llvm::SmallVector<int32_t> operandSegments;
+  addOperands(operands, operandSegments, lowerbounds);
+  addOperands(operands, operandSegments, upperbounds);
+  addOperands(operands, operandSegments, steps);
+  addOperands(operands, operandSegments, gangOperands);
+  addOperands(operands, operandSegments, workerNumOperands);
+  addOperands(operands, operandSegments, vectorOperands);
+  addOperands(operands, operandSegments, tileOperands);
+  addOperands(operands, operandSegments, cacheOperands);
+  addOperands(operands, operandSegments, privateOperands);
+  addOperands(operands, operandSegments, reductionOperands);
+
+  auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
+      builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
+      operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
+      ivLocs);
+
+  for (auto [arg, value] : llvm::zip(
+           loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
+    fir::StoreOp::create(builder, currentLocation, arg, value);
+
+  loopOp.setInclusiveUpperbound(inclusiveBounds);
+
+  return loopOp;
+}
+
 static mlir::acc::LoopOp createLoopOp(
    Fortran::lower::AbstractConverter &converter,
    mlir::Location currentLocation,
@ -2154,9 +2317,9 @@ static mlir::acc::LoopOp createLoopOp(
        std::nullopt,
    bool needEarlyReturnHandling = false) {
  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
+  llvm::SmallVector<mlir::Value> tileOperands, privateOperands,
      reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
-      gangOperands, lowerbounds, upperbounds, steps;
+      gangOperands;
  llvm::SmallVector<mlir::Attribute> privatizationRecipes, reductionRecipes;
  llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
  llvm::SmallVector<int64_t> collapseValues;
@ -2325,107 +2488,6 @@ static mlir::acc::LoopOp createLoopOp(
    }
  }

-  llvm::SmallVector<mlir::Type> ivTypes;
-  llvm::SmallVector<mlir::Location> ivLocs;
-  llvm::SmallVector<bool> inclusiveBounds;
-  llvm::SmallVector<mlir::Location> locs;
-  locs.push_back(currentLocation); // Location of the directive
-  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
-  bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
-  if (isDoConcurrent) {
-    locs.push_back(converter.genLocation(
-        Fortran::parser::FindSourceLocation(outerDoConstruct)));
-    const Fortran::parser::LoopControl *loopControl =
-        &*outerDoConstruct.GetLoopControl();
-    const auto &concurrent =
-        std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
-    if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
-             .empty())
-      TODO(currentLocation, "DO CONCURRENT with locality spec");
-
-    const auto &concurrentHeader =
-        std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
-    const auto &controls =
-        std::get<std::list<Fortran::parser::ConcurrentControl>>(
-            concurrentHeader.t);
-    for (const auto &control : controls) {
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
-      if (const auto &expr =
-              std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
-                  control.t))
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(*expr), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      const auto &name = std::get<Fortran::parser::Name>(control.t);
-      privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes,
-                  isDoConcurrent);
-
-      inclusiveBounds.push_back(true);
-    }
-  } else {
-    int64_t loopCount =
-        Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
-    for (unsigned i = 0; i < loopCount; ++i) {
-      const Fortran::parser::LoopControl *loopControl;
-      if (i == 0) {
-        loopControl = &*outerDoConstruct.GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(outerDoConstruct)));
-      } else {
-        auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
-        assert(doCons && "expect do construct");
-        loopControl = &*doCons->GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(*doCons)));
-      }
-
-      const Fortran::parser::LoopControl::Bounds *bounds =
-          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
-      assert(bounds && "Expected bounds on the loop construct");
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
-      if (bounds->step)
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      Fortran::semantics::Symbol &ivSym =
-          bounds->name.thing.symbol->GetUltimate();
-      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes);
-
-      inclusiveBounds.push_back(true);
-
-      if (i < loopCount - 1)
-        crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
-    }
-  }
-
-  // Prepare the operand segment size attribute and the operands value range.
-  llvm::SmallVector<mlir::Value> operands;
-  llvm::SmallVector<int32_t> operandSegments;
-  addOperands(operands, operandSegments, lowerbounds);
-  addOperands(operands, operandSegments, upperbounds);
-  addOperands(operands, operandSegments, steps);
-  addOperands(operands, operandSegments, gangOperands);
-  addOperands(operands, operandSegments, workerNumOperands);
-  addOperands(operands, operandSegments, vectorOperands);
-  addOperands(operands, operandSegments, tileOperands);
-  addOperands(operands, operandSegments, cacheOperands);
-  addOperands(operands, operandSegments, privateOperands);
-  addOperands(operands, operandSegments, reductionOperands);
-
  llvm::SmallVector<mlir::Type> retTy;
  mlir::Value yieldValue;
  if (needEarlyReturnHandling) {
@ -2434,16 +2496,13 @@ static mlir::acc::LoopOp createLoopOp(
    retTy.push_back(i1Ty);
  }

-  auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
-      builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
-      operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
-      ivLocs);
-
-  for (auto [arg, value] : llvm::zip(
-           loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
-    fir::StoreOp::create(builder, currentLocation, arg, value);
-
-  loopOp.setInclusiveUpperbound(inclusiveBounds);
+  uint64_t loopsToProcess =
+      Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
+  auto loopOp = buildACCLoopOp(
+      converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct,
+      eval, privateOperands, privatizationRecipes, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands, retTy, yieldValue, loopsToProcess);

  if (!gangDeviceTypes.empty())
    loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes));
@ -4899,6 +4958,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
  return false;
 }

+bool Fortran::lower::isInsideOpenACCComputeConstruct(
+    fir::FirOpBuilder &builder) {
+  return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(
+      mlir::acc::getEnclosingComputeOp(builder.getRegion()));
+}
+
 void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside(
    fir::FirOpBuilder &builder) {
  if (auto loopOp =
@ -4913,10 +4978,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder,
  mlir::acc::YieldOp::create(builder, loc, yieldValue);
 }

-int64_t Fortran::lower::getLoopCountForCollapseAndTile(
+uint64_t Fortran::lower::getLoopCountForCollapseAndTile(
    const Fortran::parser::AccClauseList &clauseList) {
-  int64_t collapseLoopCount = 1;
-  int64_t tileLoopCount = 1;
+  uint64_t collapseLoopCount = 1;
+  uint64_t tileLoopCount = 1;
  for (const Fortran::parser::AccClause &clause : clauseList.v) {
    if (const auto *collapseClause =
            std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
@ -4935,3 +5000,101 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile(
    return tileLoopCount;
  return collapseLoopCount;
 }
+
+/// Create an ACC loop operation for a DO construct when inside ACC compute
+/// constructs This serves as a bridge between regular DO construct handling and
+/// ACC loop creation
+mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
+    AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semanticsContext,
+    Fortran::lower::SymMap &localSymbols,
+    const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) {
+  // Only convert loops which have induction variables that need privatized.
+  if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent())
+    return nullptr;
+
+  // If the evaluation is unstructured, then we cannot convert the loop
+  // because acc loop does not have an unstructured form.
+  // TODO: There may be other strategies that can be employed such
+  // as generating acc.private for the loop variables without attaching
+  // them to acc.loop.
+  // For now - generate a not-yet-implemented message because without
+  // privatizing the induction variable, the loop may not execute correctly.
+  // Only do this for `acc kernels` because in `acc parallel`, scalars end
+  // up as implicitly firstprivate.
+  if (eval.lowerAsUnstructured()) {
+    if (mlir::isa_and_present<mlir::acc::KernelsOp>(
+            mlir::acc::getEnclosingComputeOp(
+                converter.getFirOpBuilder().getRegion())))
+      TODO(converter.getCurrentLocation(),
+           "unstructured do loop in acc kernels");
+    return nullptr;
+  }
+
+  // Open up a new scope for the loop variables.
+  localSymbols.pushScope();
+  auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); });
+
+  // Prepare empty operand vectors since there are no associated `acc loop`
+  // clauses with the Fortran do loops being handled here.
+  llvm::SmallVector<mlir::Value> privateOperands, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands;
+  llvm::SmallVector<mlir::Attribute> privatizationRecipes;
+  llvm::SmallVector<mlir::Type> retTy;
+  mlir::Value yieldValue;
+  uint64_t loopsToProcess = 1; // Single loop construct
+
+  // Use same mechanism that handles `acc loop` contained do loops to handle
+  // the implicit loop case.
+  Fortran::lower::StatementContext stmtCtx;
+  auto loopOp = buildACCLoopOp(
+      converter, converter.getCurrentLocation(), semanticsContext, stmtCtx,
+      doConstruct, eval, privateOperands, privatizationRecipes, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands, retTy, yieldValue, loopsToProcess);
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  if (!privatizationRecipes.empty())
+    loopOp.setPrivatizationRecipesAttr(mlir::ArrayAttr::get(
+        converter.getFirOpBuilder().getContext(), privatizationRecipes));
+
+  // Normal do loops which are not annotated with `acc loop` should be
+  // left for analysis by marking with `auto`. This is the case even in the case
+  // of `acc parallel` region because the normal rules of applying `independent`
+  // is only for loops marked with `acc loop`.
+  // For do concurrent loops, the spec says in section 2.17.2:
+  // "When do concurrent appears without a loop construct in a kernels construct
+  // it is treated as if it is annotated with loop auto. If it appears in a
+  // parallel construct or an accelerator routine then it is treated as if it is
+  // annotated with loop independent."
+  // So this means that in all cases we mark with `auto` unless it is a
+  // `do concurrent` in an `acc parallel` construct or it must be `seq` because
+  // it is in an `acc serial` construct.
+  mlir::Operation *accRegionOp =
+      mlir::acc::getEnclosingComputeOp(converter.getFirOpBuilder().getRegion());
+  mlir::acc::LoopParMode parMode =
+      mlir::isa_and_present<mlir::acc::ParallelOp>(accRegionOp) &&
+              doConstruct.IsDoConcurrent()
+          ? mlir::acc::LoopParMode::loop_independent
+      : mlir::isa_and_present<mlir::acc::SerialOp>(accRegionOp)
+          ? mlir::acc::LoopParMode::loop_seq
+          : mlir::acc::LoopParMode::loop_auto;
+
+  // Set the parallel mode based on the computed parMode
+  auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
+      builder.getContext(), mlir::acc::DeviceType::None);
+  auto arrOfDeviceNone =
+      mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr);
+  if (parMode == mlir::acc::LoopParMode::loop_independent) {
+    loopOp.setIndependentAttr(arrOfDeviceNone);
+  } else if (parMode == mlir::acc::LoopParMode::loop_seq) {
+    loopOp.setSeqAttr(arrOfDeviceNone);
+  } else if (parMode == mlir::acc::LoopParMode::loop_auto) {
+    loopOp.setAuto_Attr(arrOfDeviceNone);
+  } else {
+    llvm_unreachable("Unexpected loop par mode");
+  }
+
+  return loopOp;
+}
--- a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
+++ b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
@ -0,0 +1,91 @@
+! RUN: split-file %s %t
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_stop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK1
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_cycle_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_goto_loop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK3
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_loop_with_inner_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK4
+
+//--- do_loop_with_stop.f90
+
+subroutine do_loop_with_stop()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  !$acc kernels
+  do i = 1, n
+    a(i) = b(i) + 1.0
+    if (i == 5) stop
+  end do
+  !$acc end kernels
+
+! CHECK1: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- do_loop_with_cycle_goto.f90
+
+subroutine do_loop_with_cycle_goto()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Do loop with cycle and goto - unstructured control flow is not converted.
+  !$acc kernels
+  do i = 1, n
+    if (i == 3) cycle
+    a(i) = b(i) + 1.0
+    if (i == 7) goto 200
+    a(i) = a(i) * 2.0
+  end do
+200 continue
+  !$acc end kernels
+
+! CHECK2: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_goto_loop.f90
+
+subroutine nested_goto_loop()
+  integer :: i, j
+  integer, parameter :: n = 10, m = 5
+  real, dimension(n,m) :: a, b
+
+  ! Nested loop with goto from inner to outer - should NOT convert to acc.loop
+  !$acc kernels
+  do i = 1, n
+    do j = 1, m
+      a(i,j) = b(i,j) + 1.0
+      if (i * j > 20) goto 300  ! Exit both loops
+    end do
+  end do
+300 continue
+  !$acc end kernels
+
+! CHECK3: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_loop_with_inner_goto.f90
+
+subroutine nested_loop_with_inner_goto()
+  integer :: ii = 0, jj = 0
+  integer, parameter :: nn = 3
+  real, dimension(nn, nn) :: aa
+  
+  aa = -1
+  
+  ! Nested loop with goto from inner loop - unstructured control flow is not converted.
+  !$acc kernels
+  do ii = 1, nn
+    do jj = 1, nn
+      if (jj > 1) goto 300
+      aa(jj, ii) = 1337
+    end do
+    300 continue
+  end do
+  !$acc end kernels
+
+! CHECK4: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
--- a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@ -0,0 +1,267 @@
+! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs.
+! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop
+subroutine basic_do_loop()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop that should be converted to acc.loop
+  !$acc kernels
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent
+subroutine basic_do_concurrent()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop
+  !$acc kernels
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel
+subroutine basic_do_loop_parallel()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop with acc parallel that should be converted to acc.loop
+  !$acc parallel
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_serial
+subroutine basic_do_loop_serial()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop with acc serial that should be converted to acc.loop
+  !$acc serial
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel
+subroutine basic_do_concurrent_parallel()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop with acc parallel
+  !$acc parallel
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial
+subroutine basic_do_concurrent_serial()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop with acc serial
+  !$acc serial
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent
+subroutine multi_dimension_do_concurrent()
+  integer :: i, j, k
+  integer, parameter :: n = 10, m = 20, l = 5
+  real, dimension(n,m,l) :: a, b
+
+  ! Multi-dimensional do concurrent with multiple iteration variables
+  !$acc kernels
+  do concurrent (i = 1:n, j = 1:m, k = 1:l)
+    a(i,j,k) = b(i,j,k) * 2.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+end subroutine
+
+
+! CHECK-LABEL: func.func @_QPnested_do_loops
+subroutine nested_do_loops()
+  integer :: i, j
+  integer, parameter :: n = 10, m = 20
+  real, dimension(n,m) :: a, b
+
+  ! Nested do loops
+  !$acc kernels
+  do i = 1, n
+    do j = 1, m
+      a(i,j) = b(i,j) + i + j
+    end do
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPvariable_bounds_and_step
+subroutine variable_bounds_and_step(n, start_val, step_val)
+  integer, intent(in) :: n, start_val, step_val
+  integer :: i
+  real, dimension(n) :: a, b
+
+  ! Do loop with variable bounds and step
+  !$acc kernels
+  do i = start_val, n, step_val
+    a(i) = b(i) * 2.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdifferent_iv_types
+subroutine different_iv_types()
+  integer(kind=8) :: i8
+  integer(kind=4) :: i4
+  integer(kind=2) :: i2
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b, c, d
+
+  ! Test different iteration variable types
+  !$acc kernels
+  do i8 = 1_8, int(n,8)
+    a(i8) = b(i8) + 1.0
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  do i4 = 1, n
+    b(i4) = c(i4) + 1.0
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  do i2 = 1_2, int(n,2)
+    c(i2) = d(i2) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+
+end subroutine
+
+! -----------------------------------------------------------------------------------------
+! Tests for loops that should NOT be converted to acc.loop due to unstructured control flow
+
+! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv
+subroutine infinite_loop_no_iv()
+  integer :: i
+  logical :: condition
+
+  ! Infinite loop with no induction variable - should NOT convert to acc.loop
+  !$acc kernels
+  do
+    i = i + 1
+    if (i > 100) exit
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPwhile_like_loop
+subroutine while_like_loop()
+  integer :: i
+  logical :: condition
+
+  i = 1
+  condition = .true.
+
+  ! While-like infinite loop - should NOT convert to acc.loop
+  !$acc kernels
+  do while (condition)
+    i = i + 1
+    if (i > 100) condition = .false.
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@ -134,6 +134,24 @@ def OpenACC_VariableTypeCategory : I32BitEnumAttr<
  let printBitEnumPrimaryGroups = 1;
 }

+// These are parallelism determination modes for `acc loop`.
+// In the enum names, we use the "loop_" prefix because "auto" is
+// a language keyword - and thus for consistency all other cases
+// do the same.
+def OpenACC_LoopSeq : I32EnumAttrCase<"loop_seq", 0>;
+def OpenACC_LoopAuto : I32EnumAttrCase<"loop_auto", 1>;
+def OpenACC_LoopIndependent : I32EnumAttrCase<"loop_independent", 2>;
+
+def OpenACC_LoopParMode : I32EnumAttr<
+    "LoopParMode",
+    "Encodes the options for loop parallelism determination mode",
+    [
+      OpenACC_LoopAuto, OpenACC_LoopIndependent,
+      OpenACC_LoopSeq]> {
+  let cppNamespace = "::mlir::acc";
+  let genSpecializedAttr = 0;
+}
+
 // Type used in operation below.
 def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>;

@ -2373,6 +2391,11 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
    // Return whether this LoopOp has a gang, worker, or vector applying to the
    // 'default'/None device-type.
    bool hasDefaultGangWorkerVector();
+
+    // Used to obtain the parallelism mode for the requested device type.
+    // This first checks if the mode is set for the device_type requested.
+    // And if not, it returns the non-device_type mode.
+    LoopParMode getDefaultOrDeviceTypeParallelism(DeviceType);
  }];

  let hasCustomAssemblyFormat = 1;
@ -2404,6 +2427,53 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
  }];

  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "::mlir::ValueRange":$lowerbounds,
+                   "::mlir::ValueRange":$upperbounds,
+                   "::mlir::ValueRange":$steps,
+                   "LoopParMode":$parMode), [{
+        auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
+          $_builder.getContext(), mlir::acc::DeviceType::None);
+        auto arrOfDeviceNone = mlir::ArrayAttr::get(
+          $_builder.getContext(), deviceNoneAttr);
+        build($_builder, $_state,
+          /*results=*/{},
+          /*lowerbound=*/lowerbounds,
+          /*upperbound=*/upperbounds,
+          /*step=*/steps,
+          /*inclusiveUpperbound=*/nullptr,
+          /*collapse=*/nullptr,
+          /*collapseDeviceType=*/nullptr,
+          /*gangOperands=*/{},
+          /*gangOperandsArgType=*/nullptr,
+          /*gangOperandsSegments=*/nullptr,
+          /*gangOperandsDeviceType=*/nullptr,
+          /*workerNumOperands=*/{},
+          /*workerNumOperandsDeviceType=*/nullptr,
+          /*vectorOperands=*/{},
+          /*vectorOperandsDeviceType=*/nullptr,
+          /*seq=*/parMode == LoopParMode::loop_seq ?
+            arrOfDeviceNone : nullptr,
+          /*independent=*/parMode == LoopParMode::loop_independent ?
+            arrOfDeviceNone : nullptr,
+          /*auto_=*/parMode == LoopParMode::loop_auto ?
+            arrOfDeviceNone : nullptr,
+          /*gang=*/nullptr,
+          /*worker=*/nullptr,
+          /*vector=*/nullptr,
+          /*tileOperands=*/{},
+          /*tileOperandsSegments=*/nullptr,
+          /*tileOperandsDeviceType=*/nullptr,
+          /*cacheOperands=*/{},
+          /*privateOperands=*/{},
+          /*privatizationRecipes=*/nullptr,
+          /*reductionOperands=*/{},
+          /*reductionRecipes=*/nullptr,
+          /*combined=*/nullptr);
+      }]
+    >
+  ];
 }

 // Yield operation for the acc.loop and acc.parallel operations.
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@ -2957,6 +2957,23 @@ bool acc::LoopOp::hasDefaultGangWorkerVector() {
         getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static);
 }

+acc::LoopParMode
+acc::LoopOp::getDefaultOrDeviceTypeParallelism(DeviceType deviceType) {
+  if (hasSeq(deviceType))
+    return LoopParMode::loop_seq;
+  if (hasAuto(deviceType))
+    return LoopParMode::loop_auto;
+  if (hasIndependent(deviceType))
+    return LoopParMode::loop_independent;
+  if (hasSeq())
+    return LoopParMode::loop_seq;
+  if (hasAuto())
+    return LoopParMode::loop_auto;
+  assert(hasIndependent() &&
+         "loop must have default auto, seq, or independent");
+  return LoopParMode::loop_independent;
+}
+
 void acc::LoopOp::addGangOperands(
    MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
    llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {