llvm-project/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/DirectivesCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
#include "flang/Support/OpenMP-utils.h"
#include "flang/Utils/OpenMP.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SmallPtrSet.h"

namespace flangomp {
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "do-concurrent-conversion"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

namespace {
namespace looputils {
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
  InductionVariableInfo(fir::DoConcurrentLoopOp loop,
                        mlir::Value inductionVar) {
    populateInfo(loop, inductionVar);
  }
  /// The operation allocating memory for iteration variable.
  mlir::Operation *iterVarMemDef;
  /// the operation(s) updating the iteration variable with the current
  /// iteration number.
  llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;

private:
  /// For the \p doLoop parameter, find the following:
  ///
  /// 1. The operation that declares its iteration variable or allocates memory
  /// for it. For example, give the following loop:
  /// ```
  ///   ...
  ///   %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
  ///   ...
  ///   fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
  ///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
  ///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
  ///     ...
  ///   }
  /// ```
  ///
  /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
  /// for `%i`.
  ///
  /// 2. The operation(s) that update the loop's iteration variable from its
  /// induction variable. For the above example, the `indVarUpdateOps` is
  /// populated with the first 2 ops in the loop's body.
  ///
  /// Note: The current implementation is dependent on how flang emits loop
  /// bodies; which is sufficient for the current simple test/use cases. If this
  /// proves to be insufficient, this should be made more generic.
  void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
    mlir::Value result = nullptr;

    // Checks if a StoreOp is updating the memref of the loop's iteration
    // variable.
    auto isStoringIV = [&](fir::StoreOp storeOp) {
      // Direct store into the IV memref.
      if (storeOp.getValue() == inductionVar) {
        indVarUpdateOps.push_back(storeOp);
        return true;
      }

      // Indirect store into the IV memref.
      if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
              storeOp.getValue().getDefiningOp())) {
        if (convertOp.getOperand() == inductionVar) {
          indVarUpdateOps.push_back(convertOp);
          indVarUpdateOps.push_back(storeOp);
          return true;
        }
      }

      return false;
    };

    for (mlir::Operation &op : loop) {
      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
        if (isStoringIV(storeOp)) {
          result = storeOp.getMemref();
          break;
        }
    }

    assert(result != nullptr && result.getDefiningOp() != nullptr);
    iterVarMemDef = result.getDefiningOp();
  }
};

/// A list of `InductionVariableInfo` records.
using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;

/// Gathers into \p liveIns the values \p loop depends on that are defined
/// outside of it. The list is appended in this order:
///   1. the (lower bound, upper bound, step) triple of every loop dimension,
///   2. values used inside the loop region but defined above it; a value is
///      added at most once and at most one value per defining op is kept,
///   3. the loop's `local` operands,
///   4. the loop's `reduce` operands.
void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
                        llvm::SmallVectorImpl<mlir::Value> &liveIns) {
  for (auto [lowerBound, upperBound, step] : llvm::zip_equal(
           loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
    liveIns.push_back(lowerBound);
    liveIns.push_back(upperBound);
    liveIns.push_back(step);
  }

  llvm::SmallDenseSet<mlir::Value> visitedValues;
  llvm::SmallPtrSet<mlir::Operation *, 8> visitedDefs;

  auto recordLiveIn = [&](mlir::OpOperand *use) {
    mlir::Value usedValue = use->get();
    if (!visitedValues.insert(usedValue).second)
      return;

    // We want to collect ops corresponding to live-ins only once. Values
    // without a defining op (block arguments) are only filtered by value.
    mlir::Operation *def = usedValue.getDefiningOp();
    if (def && !visitedDefs.insert(def).second)
      return;

    liveIns.push_back(usedValue);
  };

  mlir::visitUsedValuesDefinedAbove(loop.getRegion(), recordLiveIn);

  for (mlir::Value localVar : loop.getLocalVars())
    liveIns.push_back(localVar);

  for (mlir::Value reduceVar : loop.getReduceVars())
    liveIns.push_back(reduceVar);
}

/// Collects "loop-local values": values produced by a `fir.alloca` located
/// outside of the loop whose users are all nested inside the loop. These
/// usually correspond to temporaries that are used inside the loop body, for
/// example for initializing other variables.
///
/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
/// example of why we need this.
///
/// \param [in] loop - the loop within which the function searches for values
/// used exclusively inside.
///
/// \param [out] locals - the set of loop-local values detected for \p loop;
/// values already present in the set are left untouched.
void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
                            llvm::SetVector<mlir::Value> &locals) {
  // True if at least one user of `value` is not nested inside the loop.
  auto hasUserOutsideLoop = [&](mlir::Value value) {
    for (mlir::Operation *user : value.getUsers())
      if (!loop->isAncestor(user))
        return true;
    return false;
  };

  loop.walk([&](mlir::Operation *nestedOp) {
    for (mlir::Value candidate : nestedOp->getOperands()) {
      if (locals.contains(candidate))
        continue;

      // Only allocas are candidates for localization.
      mlir::Operation *allocOp = candidate.getDefiningOp();
      if (!mlir::isa_and_present<fir::AllocaOp>(allocOp))
        continue;

      // Values defined inside the loop are not interesting since they do not
      // need to be localized.
      if (loop->isAncestor(allocOp))
        continue;

      if (!hasUserOutsideLoop(candidate))
        locals.insert(candidate);
    }
  });
}

/// Localizes a "loop-local" value within the parallel region the loop maps
/// to, by moving the operation defining \p local so that it becomes the first
/// operation of the first block of \p allocRegion.
///
/// \param local - the value used exclusively within a loop's scope (see
/// collectLoopLocalValues).
///
/// \param allocRegion - the region that receives \p local's defining
/// operation; its first block is expected to already contain an operation.
///
/// \param rewriter - rewriter used to move the operation.
static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
                                   mlir::ConversionPatternRewriter &rewriter) {
  mlir::Operation *localDef = local.getDefiningOp();
  mlir::Operation *firstOpInRegion = &allocRegion.front().front();
  rewriter.moveOpBefore(localDef, firstOpInRegion);
}
} // namespace looputils

class DoConcurrentConversion
    : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
private:
  /// Shape information extracted from the `hlfir.declare` op defining a
  /// live-in value. Both vectors stay empty when the live-in is not defined by
  /// an `hlfir.declare` or when that declare has no shape operand.
  struct TargetDeclareShapeCreationInfo {
    // Note: We use `std::vector` (rather than `llvm::SmallVector` as usual) to
    // interface more easily `ShapeShiftOp::getOrigins()` which returns
    // `std::vector`.

    /// Origins of a `fir.shape_shift`; empty for a plain `fir.shape`.
    std::vector<mlir::Value> startIndices;
    /// Extents of the `fir.shape` or `fir.shape_shift`.
    std::vector<mlir::Value> extents;

    /// Collects the shape info of \p liveIn. Shapes that are defined by
    /// anything other than `fir.shape` or `fir.shape_shift` hit a TODO.
    TargetDeclareShapeCreationInfo(mlir::Value liveIn) {
      mlir::Value shape = nullptr;
      mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
      auto declareOp =
          mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);

      if (declareOp != nullptr)
        shape = declareOp.getShape();

      if (!shape)
        return;

      auto shapeOp =
          mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
      auto shapeShiftOp =
          mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp());

      // Note the trailing space in the first literal: the two literals are
      // concatenated into a single message.
      if (!shapeOp && !shapeShiftOp)
        TODO(liveIn.getLoc(),
             "Shapes not defined by `fir.shape` or `fir.shape_shift` op's are "
             "not supported yet.");

      if (shapeShiftOp != nullptr)
        startIndices = shapeShiftOp.getOrigins();

      extents = shapeOp != nullptr
                    ? std::vector<mlir::Value>(shapeOp.getExtents().begin(),
                                               shapeOp.getExtents().end())
                    : shapeShiftOp.getExtents();
    }

    /// True if extents were recorded for the live-in.
    bool isShapedValue() const { return !extents.empty(); }
    /// True if start indices (origins) were recorded for the live-in.
    bool isShapeShiftedValue() const { return !startIndices.empty(); }
  };

  /// Maps a live-in value to the shape info collected for it.
  using LiveInShapeInfoMap =
      llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;

public:
  using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;

  /// Creates the conversion pattern.
  ///
  /// \param context - MLIR context forwarded to the base conversion pattern.
  /// \param mapToDevice - when true, the generated loop is additionally
  /// wrapped in `omp.target`, `omp.teams` and `omp.distribute` ops.
  /// \param concurrentLoopsToSkip - set (held by reference) that receives the
  /// `fir.do_concurrent` ops found nested inside a converted loop.
  /// \param moduleSymbolTable - symbol table (held by reference) used to look
  /// up FIR localizers/reducers and to insert the generated OpenMP symbols.
  DoConcurrentConversion(
      mlir::MLIRContext *context, bool mapToDevice,
      llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip,
      mlir::SymbolTable &moduleSymbolTable)
      : OpConversionPattern(context), mapToDevice(mapToDevice),
        concurrentLoopsToSkip(concurrentLoopsToSkip),
        moduleSymbolTable(moduleSymbolTable) {}

  /// Rewrites a `fir.do_concurrent` op into OpenMP constructs.
  ///
  /// On the host (`mapToDevice == false`) the generated nest is
  /// `omp.parallel` > `omp.wsloop` > `omp.loop_nest`. When mapping to the
  /// device, the nest is `omp.target` > `omp.teams` > `omp.parallel` >
  /// `omp.distribute` > `omp.wsloop` > `omp.loop_nest`, with the parallel,
  /// distribute and wsloop ops marked as composite.
  ///
  /// The ops preceding the terminating `fir.do_concurrent.loop` inside the
  /// wrapper are moved to the alloca block before the wrapper is erased.
  /// Always returns success.
  mlir::LogicalResult
  matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    looputils::InductionVariableInfos ivInfos;
    // The actual loop is the terminator of the wrapper's last block.
    auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
        doLoop.getRegion().back().getTerminator());

    auto indVars = loop.getLoopInductionVars();
    assert(indVars.has_value());

    // One info record per induction variable of the loop.
    for (mlir::Value indVar : *indVars)
      ivInfos.emplace_back(loop, indVar);

    llvm::SmallVector<mlir::Value> loopNestLiveIns;
    looputils::collectLoopLiveIns(loop, loopNestLiveIns);
    assert(!loopNestLiveIns.empty());

    llvm::SetVector<mlir::Value> locals;
    looputils::collectLoopLocalValues(loop, locals);

    // We do not want to map "loop-local" values to the device through
    // `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
    loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
                                          [&](mlir::Value liveIn) {
                                            return locals.contains(liveIn);
                                          }),
                          loopNestLiveIns.end());

    mlir::omp::TargetOp targetOp;
    mlir::omp::LoopNestOperands loopNestClauseOps;

    // Maps values of the original loop to their counterparts in the generated
    // OpenMP constructs; consumed when the loop body is cloned.
    mlir::IRMapping mapper;

    if (mapToDevice) {
      mlir::ModuleOp module = doLoop->getParentOfType<mlir::ModuleOp>();
      bool isTargetDevice =
          llvm::cast<mlir::omp::OffloadModuleInterface>(*module)
              .getIsTargetDevice();

      // The loop bounds are only registered as `host_eval` operands when the
      // module is not flagged as a target-device module.
      mlir::omp::TargetOperands targetClauseOps;
      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
                           isTargetDevice ? nullptr : &targetClauseOps);

      LiveInShapeInfoMap liveInShapeInfoMap;
      fir::FirOpBuilder builder(
          rewriter,
          fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

      // Every remaining live-in gets a map-info op and a shape info record.
      for (mlir::Value liveIn : loopNestLiveIns) {
        targetClauseOps.mapVars.push_back(
            genMapInfoOpForLiveIn(builder, liveIn));
        liveInShapeInfoMap.insert(
            {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
      }

      targetOp =
          genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
                      targetClauseOps, loopNestClauseOps, liveInShapeInfoMap);
      genTeamsOp(rewriter, loop, mapper);
    }

    mlir::omp::ParallelOp parallelOp =
        genParallelOp(rewriter, loop, ivInfos, mapper);

    // Only set as composite when part of `distribute parallel do`.
    parallelOp.setComposite(mapToDevice);

    // On the device path the loop-nest operands were already generated above.
    if (!mapToDevice)
      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);

    // Move the definitions of loop-local values into the parallel region.
    for (mlir::Value local : locals)
      looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                        rewriter);

    if (mapToDevice)
      genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);

    auto [loopNestOp, wsLoopOp] =
        genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                    /*isComposite=*/mapToDevice);

    // `local` and `reduce` region arguments are transferred/cloned from the
    // `do concurrent` loop to the loopnest op when the region is cloned above.
    // Redirect their uses to the region arguments of the op that carries the
    // corresponding clause: on the device path, `local` arguments map to the
    // parallel op's arguments and `reduce` arguments to the workshare loop's;
    // on the host path, both map to the workshare loop's arguments.
    if (mapToDevice) {
      for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
               parallelOp.getRegion().getArguments(),
               loopNestOp.getRegion().getArguments().slice(
                   loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
        rewriter.replaceAllUsesWith(loopNestArg, parallelArg);

      for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
               wsLoopOp.getRegion().getArguments(),
               loopNestOp.getRegion().getArguments().slice(
                   loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
    } else {
      for (auto [wsloopArg, loopNestArg] :
           llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
                           loopNestOp.getRegion().getArguments().drop_front(
                               loopNestClauseOps.loopLowerBounds.size())))
        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
    }

    // Drop the now-unused `local`/`reduce` arguments from the loop nest. They
    // follow the induction variables, so the same index is erased repeatedly.
    for (unsigned i = 0;
         i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
      loopNestOp.getRegion().eraseArgument(
          loopNestClauseOps.loopLowerBounds.size());

    rewriter.setInsertionPoint(doLoop);
    fir::FirOpBuilder builder(
        rewriter,
        fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

    // Collect all ops of the `fir.do_concurrent` wrapper except the last one
    // so that we can move them outside the wrapper (before erasing it).
    llvm::SmallVector<mlir::Operation *> opsToMove;
    for (mlir::Operation &op : llvm::drop_end(doLoop))
      opsToMove.push_back(&op);

    mlir::Block *allocBlock = builder.getAllocaBlock();

    // Inserting at the block's beginning in reverse order preserves the
    // original relative order of the moved ops.
    for (mlir::Operation *op : llvm::reverse(opsToMove)) {
      rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
    }

    // Mark `fir.do_concurrent` ops nested inside the generated loop nest to be
    // skipped from the legality check of the `ConversionTarget` since we are
    // not interested in mapping them to OpenMP.
    loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
      concurrentLoopsToSkip.insert(doLoop);
    });

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }

private:
  /// Creates an `omp.parallel` op at the rewriter's current insertion point,
  /// builds its entry block and terminator, and clones the induction variable
  /// allocations into it. When mapping to the device, the loop's `local`
  /// specifiers are attached to the parallel op as `private` clauses.
  ///
  /// On return, the rewriter's insertion point is inside the parallel region,
  /// right before its terminator.
  mlir::omp::ParallelOp
  genParallelOp(mlir::ConversionPatternRewriter &rewriter,
                fir::DoConcurrentLoopOp loop,
                looputils::InductionVariableInfos &ivInfos,
                mlir::IRMapping &mapper) const {
    mlir::Location loc = loop.getLoc();

    mlir::omp::ParallelOperands clauseOperands;
    if (mapToDevice)
      genPrivatizers(rewriter, mapper, loop, clauseOperands);

    auto parallelOp =
        mlir::omp::ParallelOp::create(rewriter, loc, clauseOperands);

    // The entry block gets one argument per private variable.
    Fortran::common::openmp::EntryBlockArgs entryArgs;
    entryArgs.priv.vars = clauseOperands.privateVars;
    Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                           parallelOp.getRegion());

    // Everything generated from now on goes before the region's terminator.
    auto terminator = mlir::omp::TerminatorOp::create(rewriter, loc);
    rewriter.setInsertionPoint(terminator);

    genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
    return parallelOp;
  }

  /// Clones, at the rewriter's current insertion point, the memory definition
  /// of the iteration variable recorded in each entry of \p ivInfos.
  void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                               looputils::InductionVariableInfos &ivInfos,
                               mlir::IRMapping &mapper) const {
    for (looputils::InductionVariableInfo &ivInfo : ivInfos)
      genInductionVariableAlloc(rewriter, ivInfo.iterVarMemDef, mapper);
  }

  /// Clones the op defining an iteration variable's memory, together with the
  /// ops defining its operands, at the rewriter's current insertion point.
  ///
  /// \param rewriter - rewriter used for cloning.
  /// \param indVarMemDef - the op defining the iteration variable's memory
  /// reference; must not be null.
  /// \param mapper - value mapping updated by (and used during) cloning.
  ///
  /// \return the clone of \p indVarMemDef.
  mlir::Operation *
  genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                            mlir::Operation *indVarMemDef,
                            mlir::IRMapping &mapper) const {
    assert(
        indVarMemDef != nullptr &&
        "Induction variable memdef is expected to have a defining operation.");

    // Operand definitions are cloned first so that the clone of
    // `indVarMemDef` picks up the cloned operands through `mapper`. Operands
    // that are block arguments have no defining op and are skipped.
    llvm::SmallSetVector<mlir::Operation *, 2> opsToClone;
    for (mlir::Value operand : indVarMemDef->getOperands())
      if (mlir::Operation *operandDef = operand.getDefiningOp())
        opsToClone.insert(operandDef);
    opsToClone.insert(indVarMemDef);

    // `indVarMemDef` is inserted last, so the last clone is the result.
    mlir::Operation *lastClone = nullptr;
    for (mlir::Operation *opToClone : opsToClone)
      lastClone = rewriter.clone(*opToClone, mapper);

    return lastClone;
  }

  /// Populates \p loopNestClauseOps with the bounds and steps of \p loop and
  /// marks the loop nest as inclusive.
  ///
  /// \param loc - unused; kept for interface stability.
  /// \param rewriter - used to create the `inclusive` unit attribute.
  /// \param loop - the loop providing lower bounds, upper bounds and steps.
  /// \param [out] loopNestClauseOps - receives the bounds/steps; its bounds
  /// must be empty on entry.
  /// \param [out] targetClauseOps - if not null, every bound/step is also
  /// appended to its `hostEvalVars`.
  void genLoopNestClauseOps(
      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
      fir::DoConcurrentLoopOp loop,
      mlir::omp::LoopNestOperands &loopNestClauseOps,
      mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
    assert(loopNestClauseOps.loopLowerBounds.empty() &&
           "Loop nest bounds were already emitted!");

    // The bound value itself is recorded (rather than result #0 of its
    // defining op): this is identical for single-result ops, also works for
    // block arguments, and keeps the recorded bound equal to the value that
    // is registered in `hostEvalVars`.
    auto captureBound = [&](mlir::Value bound,
                            llvm::SmallVectorImpl<mlir::Value> &bounds) {
      bounds.push_back(bound);

      // Ensure that loop-nest bounds are evaluated in the host and forwarded to
      // the nested omp constructs when we map to the device.
      if (targetClauseOps)
        targetClauseOps->hostEvalVars.push_back(bound);
    };

    for (auto [lb, ub, st] : llvm::zip_equal(
             loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
      captureBound(lb, loopNestClauseOps.loopLowerBounds);
      captureBound(ub, loopNestClauseOps.loopUpperBounds);
      captureBound(st, loopNestClauseOps.loopSteps);
    }

    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
  }

  /// Creates an `omp.wsloop` op wrapping an `omp.loop_nest` op and clones the
  /// body of \p loop into the loop nest using \p mapper.
  ///
  /// Reductions are always attached to the workshare loop; privatizers are
  /// attached to it only when not mapping to the device.
  ///
  /// \param clauseOps - operands (bounds, steps, ...) of the loop nest.
  /// \param isComposite - value of the workshare loop's composite flag.
  ///
  /// \return the created loop nest and workshare loop ops, in that order.
  std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
              fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
              const mlir::omp::LoopNestOperands &clauseOps,
              bool isComposite) const {
    mlir::Location loc = loop.getLoc();

    mlir::omp::WsloopOperands wsloopOperands;
    if (!mapToDevice)
      genPrivatizers(rewriter, mapper, loop, wsloopOperands);
    genReductions(rewriter, mapper, loop, wsloopOperands);

    auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, loc, wsloopOperands);
    wsloopOp.setComposite(isComposite);

    // Entry block arguments: private variables and reduction variables.
    Fortran::common::openmp::EntryBlockArgs entryArgs;
    entryArgs.priv.vars = wsloopOperands.privateVars;
    entryArgs.reduction.vars = wsloopOperands.reductionVars;
    Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                           wsloopOp.getRegion());

    auto loopNestOp = mlir::omp::LoopNestOp::create(rewriter, loc, clauseOps);
    mlir::Region &loopNestRegion = loopNestOp.getRegion();

    // Clone the loop's body inside the loop nest construct using the
    // mapped values.
    rewriter.cloneRegionBefore(loop.getRegion(), loopNestRegion,
                               loopNestRegion.begin(), mapper);

    // Terminate the cloned body with an `omp.yield`.
    rewriter.setInsertionPointToEnd(&loopNestRegion.back());
    mlir::omp::YieldOp::create(rewriter, loc);

    return {loopNestOp, wsloopOp};
  }

  /// Generates implicit `omp.map.bounds` ops for a live-in value.
  ///
  /// \param builder - builder used to emit the ops.
  /// \param liveIn - the live-in value, translated to an extended value.
  /// \param rawAddr - the address used as the data operand base; also
  /// provides the location of the generated ops.
  /// \param [out] boundsOps - overwritten with the generated bounds values.
  void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
                    mlir::Value rawAddr,
                    llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
    mlir::Location loc = rawAddr.getLoc();

    fir::ExtendedValue liveInExv =
        hlfir::translateToExtendedValue(loc, builder, hlfir::Entity{liveIn},
                                        /*contiguousHint=*/true)
            .first;

    fir::factory::AddrAndBoundsInfo addrInfo =
        fir::factory::getDataOperandBaseAddr(builder, rawAddr,
                                             /*isOptional=*/false, loc);

    boundsOps = fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
                                                   mlir::omp::MapBoundsType>(
        builder, addrInfo, liveInExv,
        /*dataExvIsAssumedSize=*/false, loc);
  }

  /// Creates the `omp.map.info` op used to map \p liveIn to the device.
  ///
  /// The mapped address is the original base of the defining `hlfir.declare`
  /// when there is one, otherwise \p liveIn itself. A value that is not
  /// pointer-like is first stored into a temporary, and the temporary is
  /// mapped instead.
  ///
  /// Trivial and character element types are captured by copy; everything
  /// else is captured by reference and, unless it is a builtin C pointer
  /// type, mapped `to` and `from`. All maps are flagged implicit.
  mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
                                             mlir::Value liveIn) const {
    mlir::Value rawAddr = liveIn;
    llvm::StringRef name;

    mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
    auto declareOp =
        mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);

    if (declareOp != nullptr) {
      // Use the raw address to avoid unboxing `fir.box` values whenever
      // possible. Put differently, if we have access to the direct value memory
      // reference/address, we use it.
      rawAddr = declareOp.getOriginalBase();
      name = declareOp.getUniqName();
    }

    if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
      mlir::OpBuilder::InsertionGuard guard(builder);
      // Insert right after the live-in's definition. This also handles
      // live-ins that are block arguments (no defining op), for which the
      // insertion point becomes the start of the owning block.
      builder.setInsertionPointAfterValue(liveIn);
      auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
      builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
      rawAddr = copyVal;
    }

    // The capture kind and map flags are derived from the live-in's element
    // type.
    mlir::Type liveInType = liveIn.getType();
    mlir::Type eleType = liveInType;
    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(liveInType))
      eleType = refType.getElementType();

    mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit;
    mlir::omp::VariableCaptureKind captureKind =
        mlir::omp::VariableCaptureKind::ByRef;

    if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
      captureKind = mlir::omp::VariableCaptureKind::ByCopy;
    } else if (!fir::isa_builtin_cptr_type(eleType)) {
      mapFlag |= mlir::omp::ClauseMapFlags::to;
      mapFlag |= mlir::omp::ClauseMapFlags::from;
    }

    llvm::SmallVector<mlir::Value> boundsOps;
    genBoundsOps(builder, liveIn, rawAddr, boundsOps);

    return Fortran::utils::openmp::createMapInfoOp(
        builder, liveIn.getLoc(), rawAddr,
        /*varPtrPtr=*/{}, name.str(), boundsOps,
        /*members=*/{},
        /*membersIndex=*/mlir::ArrayAttr{}, mapFlag, captureKind,
        rawAddr.getType());
  }

  /// Creates the `omp.target` op, its entry block and terminator, and
  /// registers in \p mapper how host values translate to values inside the
  /// target region.
  ///
  /// \param loc - location of the target op.
  /// \param rewriter - rewriter used to create the ops.
  /// \param mapper - receives the host-to-device value mappings.
  /// \param mappedVars - the live-in values; must pair one-to-one with
  /// `clauseOps.mapVars`.
  /// \param clauseOps - operands of the target op (host-eval and map vars).
  /// \param loopNestClauseOps - its bounds and steps are replaced by their
  /// mapped values (which requires them to be present in \p mapper).
  /// \param liveInShapeInfoMap - shape info for every value in \p mappedVars.
  ///
  /// On return, the rewriter's insertion point is inside the target region,
  /// right before its terminator.
  mlir::omp::TargetOp
  genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
              mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars,
              mlir::omp::TargetOperands &clauseOps,
              mlir::omp::LoopNestOperands &loopNestClauseOps,
              const LiveInShapeInfoMap &liveInShapeInfoMap) const {
    auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps);
    auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);

    mlir::Region &region = targetOp.getRegion();

    // The entry block takes one argument per host-eval variable followed by
    // one argument per mapped variable.
    llvm::SmallVector<mlir::Type> regionArgTypes;
    llvm::SmallVector<mlir::Location> regionArgLocs;

    for (auto var : llvm::concat<const mlir::Value>(clauseOps.hostEvalVars,
                                                    clauseOps.mapVars)) {
      regionArgTypes.push_back(var.getType());
      regionArgLocs.push_back(var.getLoc());
    }

    rewriter.createBlock(&region, {}, regionArgTypes, regionArgLocs);
    fir::FirOpBuilder builder(
        rewriter,
        fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));

    // Within the loop, it is possible that we discover other values that need
    // to be mapped to the target region (the shape info values for arrays, for
    // example). Therefore, the map block args might be extended and resized.
    // Hence, we invoke `argIface.getMapBlockArgs()` every iteration to make
    // sure we access the proper vector of data.
    int idx = 0;
    for (auto [mapInfoOp, mappedVar] :
         llvm::zip_equal(clauseOps.mapVars, mappedVars)) {
      auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
      hlfir::DeclareOp liveInDeclare =
          genLiveInDeclare(builder, targetOp, argIface.getMapBlockArgs()[idx],
                           miOp, liveInShapeInfoMap.at(mappedVar));
      ++idx;

      // If `mappedVar.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
      // need to "unpack" the box by getting the defining op of it's value.
      // However, we did not hit this case in reality yet so leaving it as a
      // todo for now. `mappedVar` may be a block argument, in which case it
      // has no defining op; hence the `_and_present` check.
      if (mlir::isa_and_present<fir::BoxAddrOp>(mappedVar.getDefiningOp()))
        TODO(mappedVar.getLoc(),
             "Mapped variabled defined by `BoxAddrOp` are not supported yet");

      // Host values that are not pointer-like are mapped to the value loaded
      // from the device-side reference.
      auto mapHostValueToDevice = [&](mlir::Value hostValue,
                                      mlir::Value deviceValue) {
        if (!llvm::isa<mlir::omp::PointerLikeType>(hostValue.getType()))
          mapper.map(hostValue,
                     builder.loadIfRef(hostValue.getLoc(), deviceValue));
        else
          mapper.map(hostValue, deviceValue);
      };

      mapHostValueToDevice(mappedVar, liveInDeclare.getOriginalBase());

      if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
              mappedVar.getDefiningOp()))
        mapHostValueToDevice(origDeclareOp.getBase(), liveInDeclare.getBase());
    }

    for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(),
                                                clauseOps.hostEvalVars))
      mapper.map(hostEval, arg);

    // Rewrite the loop-nest bounds and steps in terms of the mapped values.
    for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) {
      loopNestClauseOps.loopLowerBounds[i] =
          mapper.lookup(loopNestClauseOps.loopLowerBounds[i]);
      loopNestClauseOps.loopUpperBounds[i] =
          mapper.lookup(loopNestClauseOps.loopUpperBounds[i]);
      loopNestClauseOps.loopSteps[i] =
          mapper.lookup(loopNestClauseOps.loopSteps[i]);
    }

    // Check if cloning the bounds introduced any dependency on the outer
    // region. If so, then either clone them as well if they are
    // MemoryEffectFree, or else copy them to a new temporary and add them to
    // the map and block_argument lists and replace their uses with the new
    // temporary.
    Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
    rewriter.setInsertionPoint(
        mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc()));

    return targetOp;
  }

  /// Creates, inside the target region, an `hlfir.declare` op for the block
  /// argument corresponding to a mapped live-in.
  ///
  /// \param builder - builder positioned inside the target region.
  /// \param targetOp - the target op; start indices and extents of the
  /// live-in's shape are mapped into it as temporaries.
  /// \param liveInArg - the target region block argument of the live-in.
  /// \param liveInMapInfoOp - the live-in's map-info op; provides the name of
  /// the declare (empty if the map-info op has no name).
  /// \param targetShapeCreationInfo - the shape info recorded for the
  /// live-in; determines whether the declare gets no shape, a `fir.shape` or
  /// a `fir.shape_shift`.
  hlfir::DeclareOp genLiveInDeclare(
      fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
      mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
      const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
    std::string liveInName = liveInMapInfoOp.getName().has_value()
                                 ? liveInMapInfoOp.getName().value().str()
                                 : std::string("");

    // Maps one shape component into the target region under the name
    // `<liveInName><suffix><dim>`.
    auto mapShapeComponent = [&](mlir::Value hostValue, const char *suffix,
                                 size_t dim) -> mlir::Value {
      return Fortran::utils::openmp::mapTemporaryValue(
          builder, targetOp, hostValue,
          liveInName + suffix + std::to_string(dim));
    };

    mlir::Value shape = [&]() -> mlir::Value {
      if (!targetShapeCreationInfo.isShapedValue())
        return {};

      if (targetShapeCreationInfo.isShapeShiftedValue()) {
        // Operands of `fir.shape_shift` are (start index, extent) pairs, one
        // pair per dimension.
        llvm::SmallVector<mlir::Value> shapeShiftOperands;

        size_t shapeIdx = 0;
        for (auto [startIndex, extent] :
             llvm::zip_equal(targetShapeCreationInfo.startIndices,
                             targetShapeCreationInfo.extents)) {
          shapeShiftOperands.push_back(
              mapShapeComponent(startIndex, ".start_idx.dim", shapeIdx));
          shapeShiftOperands.push_back(
              mapShapeComponent(extent, ".extent.dim", shapeIdx));
          ++shapeIdx;
        }

        auto shapeShiftType = fir::ShapeShiftType::get(
            builder.getContext(), shapeShiftOperands.size() / 2);
        return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(),
                                         shapeShiftType, shapeShiftOperands);
      }

      llvm::SmallVector<mlir::Value> shapeOperands;
      size_t shapeIdx = 0;
      for (auto extent : targetShapeCreationInfo.extents) {
        shapeOperands.push_back(
            mapShapeComponent(extent, ".extent.dim", shapeIdx));
        ++shapeIdx;
      }

      return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands);
    }();

    return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg,
                                    liveInName, shape);
  }

  /// Creates an `omp.teams` op carrying the loop's reductions, builds its
  /// entry block and terminator, and maps every `reduce` operand of \p loop
  /// to the corresponding region argument of the teams op.
  ///
  /// On return, the rewriter's insertion point is inside the teams region,
  /// right before its terminator.
  mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter,
                                fir::DoConcurrentLoopOp loop,
                                mlir::IRMapping &mapper) const {
    mlir::omp::TeamsOperands teamsOperands;
    genReductions(rewriter, mapper, loop, teamsOperands);

    mlir::Location loc = loop.getLoc();
    auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, teamsOperands);

    // The entry block gets one argument per reduction variable.
    Fortran::common::openmp::EntryBlockArgs entryArgs;
    entryArgs.reduction.vars = teamsOperands.reductionVars;
    Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                           teamsOp.getRegion());

    auto terminator = mlir::omp::TerminatorOp::create(rewriter, loc);
    rewriter.setInsertionPoint(terminator);

    for (auto [reduceVar, regionArg] : llvm::zip_equal(
             loop.getReduceVars(), teamsOp.getRegion().getArguments()))
      mapper.map(reduceVar, regionArg);

    return teamsOp;
  }

  /// Creates an `omp.distribute` op without clauses and gives its region an
  /// empty block. `createBlock` leaves the rewriter's insertion point inside
  /// that new block.
  mlir::omp::DistributeOp
  genDistributeOp(mlir::Location loc,
                  mlir::ConversionPatternRewriter &rewriter) const {
    mlir::omp::DistributeOperands noClauses{};
    auto distributeOp =
        mlir::omp::DistributeOp::create(rewriter, loc, noClauses);

    rewriter.createBlock(&distributeOp.getRegion());
    return distributeOp;
  }

  /// Clones \p firRegion into \p ompRegion and replaces the `fir.yield`
  /// terminating the last cloned block with an `omp.yield` carrying the same
  /// operands. Does nothing when \p firRegion is empty.
  void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
                           mlir::Region &firRegion,
                           mlir::Region &ompRegion) const {
    if (firRegion.empty())
      return;

    rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());

    mlir::Operation *clonedTerminator = ompRegion.back().getTerminator();
    auto firYield = mlir::cast<fir::YieldOp>(clonedTerminator);

    rewriter.setInsertionPoint(firYield);
    mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
                               firYield.getOperands());
    rewriter.eraseOp(firYield);
  }

  /// Generate OpenMP privatizers for the `local` specifiers of \p loop by
  /// cloning the bodies of the corresponding FIR localizers.
  ///
  /// The OpenMP privatizer of a FIR localizer `<sym>` is named `<sym>.omp`.
  /// If a privatizer with that name already exists in the module symbol table
  /// it is reused rather than created again (mirroring `genReductions`).
  /// `local_init` localizers are not supported yet and hit a TODO.
  ///
  /// \param [in] rewriter - used to drive IR generation for privatizers.
  /// \param [in] mapper - value mapping from FIR to OpenMP constructs; used
  /// to translate the privatized variables when mapping to the device.
  /// \param [in] loop - FIR loop to convert its localizers.
  ///
  /// \param [out] privateClauseOps - receives the private variables and the
  /// symbols of their privatizers. The caller decides which OpenMP op these
  /// clause operands are attached to.
  void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
                      mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
                      mlir::omp::PrivateClauseOps &privateClauseOps) const {
    // Nothing to do for loops without `local` operands.
    if (loop.getLocalVars().empty())
      return;

    for (auto [var, sym, arg] : llvm::zip_equal(
             loop.getLocalVars(),
             loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
             loop.getRegionLocalArgs())) {
      auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
          sym.getLeafReference());
      if (localizer.getLocalitySpecifierType() ==
          fir::LocalitySpecifierType::LocalInit)
        TODO(localizer.getLoc(), "local_init conversion is not supported yet");

      // Privatizers are created right after the FIR localizer; the guard
      // restores the caller's insertion point afterwards.
      mlir::OpBuilder::InsertionGuard guard(rewriter);
      rewriter.setInsertionPointAfter(localizer);
      std::string privatizerName = sym.getLeafReference().str() + ".omp";

      auto privatizer = moduleSymbolTable.lookup<mlir::omp::PrivateClauseOp>(
          rewriter.getStringAttr(privatizerName));

      if (!privatizer) {
        privatizer = mlir::omp::PrivateClauseOp::create(
            rewriter, localizer.getLoc(), privatizerName,
            localizer.getTypeAttr().getValue(),
            mlir::omp::DataSharingClauseType::Private);

        cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
                            privatizer.getInitRegion());
        cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
                            privatizer.getDeallocRegion());

        moduleSymbolTable.insert(privatizer);
      }

      privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
                                                         : var);
      privateClauseOps.privateSyms.push_back(
          mlir::SymbolRefAttr::get(privatizer));
    }
  }

  /// Generate OpenMP reduction declarations for the `reduce` specifiers of
  /// \p loop by cloning the regions of the corresponding FIR reducers.
  ///
  /// The OpenMP reducer of a FIR reducer `<sym>` is named `<sym>.omp`; an
  /// already existing reducer with that name is reused.
  ///
  /// \param [in] rewriter - used to drive IR generation for reducers.
  /// \param [in] mapper - value mapping from FIR to OpenMP constructs; used
  /// to translate the reduction variables when mapping to the device.
  /// \param [in] loop - FIR loop to convert its reductions.
  ///
  /// \param [out] reductionClauseOps - receives the reduction variables,
  /// their by-ref flags and the symbols of their reducers.
  void genReductions(mlir::ConversionPatternRewriter &rewriter,
                     mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
                     mlir::omp::ReductionClauseOps &reductionClauseOps) const {
    // Nothing to do for loops without `reduce` operands.
    if (loop.getReduceVars().empty())
      return;

    for (auto [var, byRef, sym, arg] : llvm::zip_equal(
             loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(),
             loop.getReduceSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
             loop.getRegionReduceArgs())) {
      auto firReducer = moduleSymbolTable.lookup<fir::DeclareReductionOp>(
          sym.getLeafReference());

      // Reducers are created right after the FIR reducer; the guard restores
      // the caller's insertion point afterwards.
      mlir::OpBuilder::InsertionGuard guard(rewriter);
      rewriter.setInsertionPointAfter(firReducer);

      std::string ompReducerName = sym.getLeafReference().str() + ".omp";
      auto ompReducer = moduleSymbolTable.lookup<mlir::omp::DeclareReductionOp>(
          rewriter.getStringAttr(ompReducerName));

      const bool reducerAlreadyExists = static_cast<bool>(ompReducer);
      if (!reducerAlreadyExists) {
        ompReducer = mlir::omp::DeclareReductionOp::create(
            rewriter, firReducer.getLoc(), ompReducerName,
            firReducer.getTypeAttr().getValue(),
            firReducer.getByrefElementTypeAttr());

        cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
                            ompReducer.getAllocRegion());
        cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
                            ompReducer.getInitializerRegion());
        cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
                            ompReducer.getReductionRegion());
        cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
                            ompReducer.getAtomicReductionRegion());
        cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
                            ompReducer.getCleanupRegion());
        moduleSymbolTable.insert(ompReducer);
      }

      mlir::Value reductionVar = mapToDevice ? mapper.lookup(var) : var;
      reductionClauseOps.reductionVars.push_back(reductionVar);
      reductionClauseOps.reductionByref.push_back(byRef);
      reductionClauseOps.reductionSyms.push_back(
          mlir::SymbolRefAttr::get(ompReducer));
    }
  }

  /// Whether the loop is additionally wrapped in target/teams/distribute
  /// constructs (device mapping) or only in host constructs.
  bool mapToDevice;
  /// Receives the `fir.do_concurrent` ops found nested inside a converted
  /// loop nest.
  llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
  /// Used to look up FIR localizers/reducers and to insert the generated
  /// OpenMP privatizers/reducers.
  mlir::SymbolTable &moduleSymbolTable;
};

/// A listener that forwards notifyOperationErased to the given callback.
struct CallbackListener : public mlir::RewriterBase::Listener {
  CallbackListener(std::function<void(mlir::Operation *op)> onOperationErased)
      : onOperationErased(onOperationErased) {}

  void notifyOperationErased(mlir::Operation *op) override {
    onOperationErased(op);
  }

  std::function<void(mlir::Operation *op)> onOperationErased;
};

/// Module pass that runs the `DoConcurrentConversion` pattern over every
/// `fir::DoConcurrentOp` in the module. The `map-to` pass option selects
/// whether the pattern is asked to map loops to the host or to a device.
class DoConcurrentConversionPass
    : public flangomp::impl::DoConcurrentConversionPassBase<
          DoConcurrentConversionPass> {
public:
  DoConcurrentConversionPass() = default;

  /// Builds the pass from an explicit set of pass options.
  DoConcurrentConversionPass(
      const flangomp::DoConcurrentConversionPassOptions &options)
      : DoConcurrentConversionPassBase(options) {}

  /// Validates `map-to`, then applies a full conversion in which a
  /// `fir::DoConcurrentOp` is legal only if it was recorded in the set of
  /// loops to skip; all other operations are always legal.
  void runOnOperation() override {
    using MappingKind = flangomp::DoConcurrentMappingKind;

    mlir::ModuleOp moduleOp = getOperation();
    mlir::MLIRContext *ctx = &getContext();
    mlir::SymbolTable moduleSymbolTable(moduleOp);

    // Anything other than `host` or `device` is rejected with a warning and
    // the module is left untouched.
    const bool targetsHost = mapTo == MappingKind::DCMK_Host;
    const bool targetsDevice = mapTo == MappingKind::DCMK_Device;
    if (!targetsHost && !targetsDevice) {
      mlir::emitWarning(mlir::UnknownLoc::get(ctx),
                        "DoConcurrentConversionPass: invalid `map-to` value. "
                        "Valid values are: `host` or `device`");
      return;
    }

    // Loops recorded here are reported as legal to the conversion driver. The
    // listener removes a loop from the set as soon as that loop is erased.
    llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
    CallbackListener eraseListener([&](mlir::Operation *erasedOp) {
      auto erasedLoop = mlir::dyn_cast<fir::DoConcurrentOp>(erasedOp);
      if (erasedLoop)
        concurrentLoopsToSkip.erase(erasedLoop);
    });

    mlir::ConversionTarget target(*ctx);
    target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
        [&](fir::DoConcurrentOp op) {
          return concurrentLoopsToSkip.contains(op);
        });
    target.markUnknownOpDynamicallyLegal(
        [](mlir::Operation *) { return true; });

    mlir::RewritePatternSet conversionPatterns(ctx);
    conversionPatterns.insert<DoConcurrentConversion>(
        ctx, targetsDevice, concurrentLoopsToSkip, moduleSymbolTable);

    mlir::ConversionConfig config;
    config.allowPatternRollback = false;
    config.listener = &eraseListener;

    const mlir::LogicalResult conversionStatus = mlir::applyFullConversion(
        moduleOp, target, std::move(conversionPatterns), config);
    if (mlir::failed(conversionStatus))
      signalPassFailure();
  }
};
} // namespace

std::unique_ptr<mlir::Pass>
flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
  DoConcurrentConversionPassOptions options;
  options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
                              : flangomp::DoConcurrentMappingKind::DCMK_Host;

  return std::make_unique<DoConcurrentConversionPass>(options);
}