llvm-project/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp

//===- GenericLoopConversion.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Support/OpenMP-utils.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

#include <memory>
#include <optional>
#include <type_traits>

namespace flangomp {
#define GEN_PASS_DEF_GENERICLOOPCONVERSIONPASS
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

namespace {

/// A conversion pattern to handle various combined forms of `omp.loop`. For
/// details on how combined/composite directives are handled, see:
/// https://discourse.llvm.org/t/rfc-representing-combined-composite-constructs-in-the-openmp-dialect/76986.
class GenericLoopConversionPattern
    : public mlir::OpConversionPattern<mlir::omp::LoopOp> {
public:
  enum class GenericLoopCombinedInfo { Standalone, TeamsLoop, ParallelLoop };

  using mlir::OpConversionPattern<mlir::omp::LoopOp>::OpConversionPattern;

  /// Creates the pattern for `ctx` and opts into bounded rewrite recursion.
  explicit GenericLoopConversionPattern(mlir::MLIRContext *ctx)
      : mlir::OpConversionPattern<mlir::omp::LoopOp>(ctx) {
    // Nested `loop` directives must be handled as well, so the pattern has to
    // be allowed to apply recursively.
    setHasBoundedRewriteRecursion(true);
  }

  /// Replaces `loopOp` with the construct(s) selected by the op that encloses
  /// it (see `findGenericLoopCombineInfo`):
  /// * standalone `loop`: dispatched on the `bind` clause by
  ///   `rewriteStandaloneLoop`,
  /// * `parallel loop`: rewritten to a worksharing loop,
  /// * `teams loop`: rewritten to `distribute parallel do` when
  ///   `teamsLoopCanBeParallelFor` allows it, otherwise to `distribute`.
  ///
  /// The original `omp.loop` op is erased afterwards and success is always
  /// returned. Unsupported ops are not expected to reach this point; this is
  /// only checked through an assertion.
  mlir::LogicalResult
  matchAndRewrite(mlir::omp::LoopOp loopOp, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    assert(mlir::succeeded(checkLoopConversionSupportStatus(loopOp)));

    switch (findGenericLoopCombineInfo(loopOp)) {
    case GenericLoopCombinedInfo::Standalone: {
      rewriteStandaloneLoop(loopOp, rewriter);
      break;
    }
    case GenericLoopCombinedInfo::ParallelLoop: {
      rewriteToWsloop(loopOp, rewriter);
      break;
    }
    case GenericLoopCombinedInfo::TeamsLoop: {
      // Fall back to plain `distribute` when the loop body rules out the
      // `distribute parallel do` mapping.
      if (!teamsLoopCanBeParallelFor(loopOp)) {
        rewriteToDistribute(loopOp, rewriter);
        break;
      }

      rewriteToDistributeParallelDo(loopOp, rewriter);
      break;
    }
    }

    // The replacement ops hold a clone of the loop nest, so the original op
    // is no longer needed.
    rewriter.eraseOp(loopOp);
    return mlir::success();
  }

  /// Reports whether `loopOp` can be converted by this pattern.
  ///
  /// Returns success when the op carries no unsupported clause. Currently the
  /// only unsupported clause is `order`: if it is present, a "not yet
  /// implemented" error is emitted on the op and failure is returned.
  static mlir::LogicalResult
  checkLoopConversionSupportStatus(mlir::omp::LoopOp loopOp) {
    if (!loopOp.getOrder())
      return mlir::success();

    // The in-flight diagnostic converts to a failed `LogicalResult`.
    const mlir::StringRef unhandledClause = "order";
    return loopOp.emitError()
           << "not yet implemented: Unhandled clause " << unhandledClause
           << " in " << loopOp->getName() << " operation";
  }

private:
  /// Classifies `loopOp` according to the op that directly encloses it:
  /// `ParallelLoop` if the parent is an `omp.parallel` op, `TeamsLoop` if it
  /// is an `omp.teams` op, and `Standalone` otherwise (including when there is
  /// no parent op).
  static GenericLoopCombinedInfo
  findGenericLoopCombineInfo(mlir::omp::LoopOp loopOp) {
    mlir::Operation *enclosingOp = loopOp->getParentOp();

    if (llvm::isa_and_present<mlir::omp::ParallelOp>(enclosingOp))
      return GenericLoopCombinedInfo::ParallelLoop;

    if (llvm::isa_and_present<mlir::omp::TeamsOp>(enclosingOp))
      return GenericLoopCombinedInfo::TeamsLoop;

    return GenericLoopCombinedInfo::Standalone;
  }

  /// Checks whether a `teams loop` construct can be rewritten to `teams
  /// distribute parallel do` or whether it has to be converted to `teams
  /// distribute`.
  ///
  /// This checks constraints similar to the ones checked by
  /// `TeamsLoopChecker` in SemaOpenMP.cpp in clang. The body of `loopOp` is
  /// walked in pre-order and the answer is `false` as soon as one of the
  /// following is found:
  /// * a nested `loop` that is combined with `parallel`, or that is standalone
  ///   with `bind(parallel)`;
  /// * a call whose callee is not a symbol starting with `omp_`.
  static bool teamsLoopCanBeParallelFor(mlir::omp::LoopOp loopOp) {
    // Worksharing loops cannot be nested inside each other. Therefore, if the
    // current `loop` directive nests another `loop` that binds to a parallel
    // region, it cannot be mapped to `distribute parallel for` but only to
    // `distribute`.
    auto bindsToParallelRegion = [](mlir::omp::LoopOp nestedLoopOp) {
      GenericLoopCombinedInfo nestedInfo =
          findGenericLoopCombineInfo(nestedLoopOp);

      if (nestedInfo == GenericLoopCombinedInfo::ParallelLoop)
        return true;

      return nestedInfo == GenericLoopCombinedInfo::Standalone &&
             nestedLoopOp.getBindKind() == mlir::omp::ClauseBindKind::Parallel;
    };

    // Calls to functions other than the OpenMP API runtime functions inhibit
    // the transformation to `teams distribute parallel do`, since the called
    // functions might have nested parallelism themselves. Callees that are
    // not symbol references are treated as non-OpenMP calls.
    auto isOpenMPAPICall = [](mlir::CallOpInterface callOp) {
      mlir::CallInterfaceCallable callee = callOp.getCallableForCallee();
      auto calleeSymRef = mlir::dyn_cast<mlir::SymbolRefAttr>(callee);

      return calleeSymRef &&
             calleeSymRef.getRootReference().strref().starts_with("omp_");
    };

    mlir::WalkResult walkResult = loopOp.walk<mlir::WalkOrder::PreOrder>(
        [&](mlir::Operation *nestedOp) -> mlir::WalkResult {
          // The walk also visits the root op itself; skip it.
          if (nestedOp == loopOp)
            return mlir::WalkResult::advance();

          if (auto nestedLoopOp = mlir::dyn_cast<mlir::omp::LoopOp>(nestedOp))
            return bindsToParallelRegion(nestedLoopOp)
                       ? mlir::WalkResult::interrupt()
                       : mlir::WalkResult::advance();

          if (auto callOp = mlir::dyn_cast<mlir::CallOpInterface>(nestedOp))
            return isOpenMPAPICall(callOp) ? mlir::WalkResult::advance()
                                           : mlir::WalkResult::interrupt();

          return mlir::WalkResult::advance();
        });

    // An interrupted walk means that an inhibiting op was found.
    return !walkResult.wasInterrupted();
  }

  /// Rewrites a standalone `loop` directive according to its `bind` clause:
  /// `bind(parallel)` becomes a worksharing loop, `bind(teams)` becomes
  /// `distribute`, and `bind(thread)` becomes `simd`. A missing `bind` clause
  /// is handled exactly like `bind(thread)`.
  void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
                             mlir::ConversionPatternRewriter &rewriter) const {
    using mlir::omp::ClauseBindKind;

    switch (loopOp.getBindKind().value_or(ClauseBindKind::Thread)) {
    case ClauseBindKind::Parallel:
      rewriteToWsloop(loopOp, rewriter);
      return;
    case ClauseBindKind::Teams:
      rewriteToDistribute(loopOp, rewriter);
      return;
    case ClauseBindKind::Thread:
      rewriteToSimdLoop(loopOp, rewriter);
      return;
    }
  }

  /// Rewrites standalone `loop` directives that have no `bind` clause, or that
  /// have `bind(thread)`, to equivalent `simd` constructs. A warning is
  /// emitted on the op to report the rewrite.
  ///
  /// The reasoning behind this decision is that according to the spec (version
  /// 5.2, section 11.7.1):
  ///
  /// "If the bind clause is not specified on a construct for which it may be
  /// specified and the construct is closely nested inside a teams or parallel
  /// construct, the effect is as if binding is teams or parallel. If none of
  /// those conditions hold, the binding region is not defined."
  ///
  /// which means that standalone `loop` directives have an undefined binding
  /// region. Moreover, the spec says (in the next paragraph):
  ///
  /// "The specified binding region determines the binding thread set.
  /// Specifically, if the binding region is a teams region, then the binding
  /// thread set is the set of initial threads that are executing that region
  /// while if the binding region is a parallel region, then the binding thread
  /// set is the team of threads that are executing that region. If the binding
  /// region is not defined, then the binding thread set is the encountering
  /// thread."
  ///
  /// which means that the binding thread set for a standalone `loop` directive
  /// is only the encountering thread.
  ///
  /// Since the encountering thread is the binding thread (set) for a
  /// standalone `loop` directive, the best we can do in such a case is to
  /// "simd" the directive.
  void rewriteToSimdLoop(mlir::omp::LoopOp loopOp,
                         mlir::ConversionPatternRewriter &rewriter) const {
    using SimdWrapperOp = mlir::omp::SimdOp;
    using SimdClauseOps = mlir::omp::SimdOperands;

    loopOp.emitWarning(
        "Detected standalone OpenMP `loop` directive with thread binding, "
        "the associated loop will be rewritten to `simd`.");

    rewriteToSingleWrapperOp<SimdWrapperOp, SimdClauseOps>(loopOp, rewriter);
  }

  /// Rewrites `loopOp` to a single `omp.distribute` wrapper op.
  ///
  /// No reduction clauses are forwarded to the `distribute` op, so the loop is
  /// required (by assertion) to have no reduction variables.
  void rewriteToDistribute(mlir::omp::LoopOp loopOp,
                           mlir::ConversionPatternRewriter &rewriter) const {
    using DistributeWrapperOp = mlir::omp::DistributeOp;
    using DistributeClauseOps = mlir::omp::DistributeOperands;

    assert(loopOp.getReductionVars().empty());
    rewriteToSingleWrapperOp<DistributeWrapperOp, DistributeClauseOps>(
        loopOp, rewriter);
  }

  /// Rewrites `loopOp` to a single `omp.wsloop` wrapper op, forwarding its
  /// private and reduction clauses.
  void rewriteToWsloop(mlir::omp::LoopOp loopOp,
                       mlir::ConversionPatternRewriter &rewriter) const {
    using WsloopWrapperOp = mlir::omp::WsloopOp;
    using WsloopClauseOps = mlir::omp::WsloopOperands;

    rewriteToSingleWrapperOp<WsloopWrapperOp, WsloopClauseOps>(loopOp,
                                                               rewriter);
  }

  // TODO Suggestion by Sergio: tag auto-generated operations for constructs
  // that weren't part of the original program, that would be useful
  // information for debugging purposes later on. This new attribute could be
  // used for `omp.loop`, but also for `do concurrent` transformations,
  // `workshare`, `workdistribute`, etc. The tag could be used for all kinds of
  // auto-generated operations using a dialect attribute (named something like
  // `omp.origin` or `omp.derived`) and perhaps hold the name of the operation
  // it was derived from, the reason it was transformed or something like that
  // we could use when emitting any messages related to it later on.

  /// Creates a single wrapper op of type `OpTy` that takes over the clauses of
  /// `loopOp` and clones the first op of the loop's body into it.
  ///
  /// \tparam OpTy         the wrapper op to create (e.g. `omp.simd`).
  /// \tparam OpOperandsTy the clause-operands structure used to build `OpTy`.
  ///
  /// Private clauses are always forwarded. Reduction clauses are forwarded for
  /// every operands type except `DistributeOperands`. The block arguments of
  /// `loopOp` are mapped, in order, to the entry block arguments of the new
  /// op. `loopOp` itself is left in place.
  template <typename OpTy, typename OpOperandsTy>
  void
  rewriteToSingleWrapperOp(mlir::omp::LoopOp loopOp,
                           mlir::ConversionPatternRewriter &rewriter) const {
    constexpr bool forwardsReductions =
        !std::is_same_v<OpOperandsTy, mlir::omp::DistributeOperands>;

    OpOperandsTy wrapperOperands;
    wrapperOperands.privateVars = loopOp.getPrivateVars();

    if (auto privateSyms = loopOp.getPrivateSyms())
      wrapperOperands.privateSyms.assign(privateSyms->begin(),
                                         privateSyms->end());

    Fortran::common::openmp::EntryBlockArgs entryArgs;
    entryArgs.priv.vars = wrapperOperands.privateVars;

    if constexpr (forwardsReductions) {
      populateReductionClauseOps(loopOp, wrapperOperands);
      entryArgs.reduction.vars = wrapperOperands.reductionVars;
    }

    auto wrapperOp = rewriter.create<OpTy>(loopOp.getLoc(), wrapperOperands);
    mlir::Block *wrapperBlock =
        genEntryBlock(rewriter, entryArgs, wrapperOp.getRegion());

    // Redirect uses of the old block arguments to the new ones; both lists
    // are required to have the same length.
    mlir::Block &loopBlock = loopOp.getRegion().front();
    mlir::IRMapping mapper;

    for (auto [loopArg, wrapperArg] : llvm::zip_equal(
             loopBlock.getArguments(), wrapperBlock->getArguments()))
      mapper.map(loopArg, wrapperArg);

    rewriter.clone(*loopOp.begin(), mapper);
  }

  /// Rewrites `loopOp` to the composite nest
  /// `omp.parallel` > `omp.distribute` > `omp.wsloop`, with each of the three
  /// ops marked as composite, and clones the first op of the loop's body into
  /// the innermost wrapper.
  ///
  /// The clauses of `loopOp` are split as follows:
  /// * private clauses go to the `omp.parallel` op,
  /// * reduction clauses go to the `omp.wsloop` op,
  /// * the `omp.distribute` op gets default-constructed clause operands.
  ///
  /// `loopOp` itself is left in place.
  void rewriteToDistributeParallelDo(
      mlir::omp::LoopOp loopOp,
      mlir::ConversionPatternRewriter &rewriter) const {
    // Outermost op: `omp.parallel`, carrying the loop's private clauses.
    mlir::omp::ParallelOperands parallelClauseOps;
    parallelClauseOps.privateVars = loopOp.getPrivateVars();

    auto privateSyms = loopOp.getPrivateSyms();
    if (privateSyms)
      parallelClauseOps.privateSyms.assign(privateSyms->begin(),
                                           privateSyms->end());

    Fortran::common::openmp::EntryBlockArgs parallelArgs;
    parallelArgs.priv.vars = parallelClauseOps.privateVars;

    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loopOp.getLoc(),
                                                             parallelClauseOps);
    genEntryBlock(rewriter, parallelArgs, parallelOp.getRegion());
    parallelOp.setComposite(true);
    // Create a terminator and keep inserting right before it.
    // NOTE(review): presumably `genEntryBlock` leaves the insertion point
    // inside the new `omp.parallel` block, so the terminator ends up there --
    // confirm against its implementation.
    rewriter.setInsertionPoint(
        rewriter.create<mlir::omp::TerminatorOp>(loopOp.getLoc()));

    // Middle op: `omp.distribute` with no clauses and a block without
    // arguments.
    mlir::omp::DistributeOperands distributeClauseOps;
    auto distributeOp = rewriter.create<mlir::omp::DistributeOp>(
        loopOp.getLoc(), distributeClauseOps);
    distributeOp.setComposite(true);
    rewriter.createBlock(&distributeOp.getRegion());

    // Innermost op: `omp.wsloop`, carrying the loop's reduction clauses.
    mlir::omp::WsloopOperands wsloopClauseOps;
    populateReductionClauseOps(loopOp, wsloopClauseOps);
    Fortran::common::openmp::EntryBlockArgs wsloopArgs;
    wsloopArgs.reduction.vars = wsloopClauseOps.reductionVars;

    auto wsloopOp =
        rewriter.create<mlir::omp::WsloopOp>(loopOp.getLoc(), wsloopClauseOps);
    wsloopOp.setComposite(true);
    genEntryBlock(rewriter, wsloopArgs, wsloopOp.getRegion());

    // Map the block arguments of the original loop to the block arguments of
    // the ops that now own the corresponding clauses.
    mlir::IRMapping mapper;

    auto loopBlockInterface =
        llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*loopOp);
    auto parallelBlockInterface =
        llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*parallelOp);
    auto wsloopBlockInterface =
        llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*wsloopOp);

    // Private block arguments now live on the `omp.parallel` op.
    for (auto [loopOpArg, parallelOpArg] :
         llvm::zip_equal(loopBlockInterface.getPrivateBlockArgs(),
                         parallelBlockInterface.getPrivateBlockArgs()))
      mapper.map(loopOpArg, parallelOpArg);

    // Reduction block arguments now live on the `omp.wsloop` op.
    for (auto [loopOpArg, wsloopOpArg] :
         llvm::zip_equal(loopBlockInterface.getReductionBlockArgs(),
                         wsloopBlockInterface.getReductionBlockArgs()))
      mapper.map(loopOpArg, wsloopOpArg);

    // Clone the first op of the loop's body (presumably the `omp.loop_nest`
    // op) at the current insertion point, remapping the block arguments.
    rewriter.clone(*loopOp.begin(), mapper);
  }

  /// Copies the reduction clause information of `loopOp` (modifier, variables,
  /// symbols and by-ref flags) into `clauseOps`. Symbols and by-ref flags are
  /// only written when the loop op has them.
  void
  populateReductionClauseOps(mlir::omp::LoopOp loopOp,
                             mlir::omp::ReductionClauseOps &clauseOps) const {
    clauseOps.reductionMod = loopOp.getReductionModAttr();
    clauseOps.reductionVars = loopOp.getReductionVars();

    if (std::optional<mlir::ArrayAttr> syms = loopOp.getReductionSyms())
      clauseOps.reductionSyms.assign(syms->begin(), syms->end());

    if (std::optional<llvm::ArrayRef<bool>> byrefFlags =
            loopOp.getReductionByref())
      clauseOps.reductionByref.assign(byrefFlags->begin(), byrefFlags->end());
  }
};

/// According to the spec (v5.2, p340, 36):
///
/// ```
/// The effect of the reduction clause is as if it is applied to all leaf
/// constructs that permit the clause, except for the following constructs:
/// * ....
/// * The teams construct, when combined with the loop construct.
/// ```
///
/// Therefore, for a combined directive similar to: `!$omp teams loop
/// reduction(...)`, the earlier stages of the compiler assign the `reduction`
/// clauses only to the `loop` leaf and not to the `teams` leaf.
///
/// On the other hand, if we have a combined construct similar to: `!$omp teams
/// distribute parallel do`, the `reduction` clauses are assigned both to the
/// `teams` and the `do` leaves. We need to match this behavior when we convert
/// `teams` op with a nested `loop` op since the target set of constructs/ops
/// will be incorrect without moving the reductions up to the `teams` op as
/// well.
///
/// This pattern does exactly this. Given the following input:
/// ```
/// omp.teams {
///   omp.loop reduction(@red_sym %red_op -> %red_arg : !fir.ref<i32>) {
///     omp.loop_nest ... {
///       ...
///     }
///   }
/// }
/// ```
/// this pattern updates the `omp.teams` op in-place to:
/// ```
/// omp.teams reduction(@red_sym %red_op -> %teams_red_arg : !fir.ref<i32>) {
///   omp.loop reduction(@red_sym %teams_red_arg -> %red_arg : !fir.ref<i32>) {
///     omp.loop_nest ... {
///       ...
///     }
///   }
/// }
/// ```
///
/// Note the following:
/// * The nested `omp.loop` is not rewritten by this pattern, this happens
///   through `GenericLoopConversionPattern`.
/// * The reduction info is cloned from the nested `omp.loop` op to the parent
///   `omp.teams` op.
/// * The reduction operand of the `omp.loop` op is updated to be the **new**
///   reduction block argument of the `omp.teams` op.
class ReductionsHoistingPattern
    : public mlir::OpConversionPattern<mlir::omp::TeamsOp> {
public:
  using mlir::OpConversionPattern<mlir::omp::TeamsOp>::OpConversionPattern;

  /// Returns the first `omp.loop` op directly nested in the (single) block of
  /// `teamsOp` that has at least one reduction variable, or a null op if there
  /// is none.
  static mlir::omp::LoopOp
  tryToFindNestedLoopWithReduction(mlir::omp::TeamsOp teamsOp) {
    assert(!teamsOp.getRegion().empty() &&
           teamsOp.getRegion().getBlocks().size() == 1);

    // TODO return error if more than one loop op is nested. We need to
    // coalesce reductions in this case.
    for (mlir::Operation &candidateOp : teamsOp.getRegion().front()) {
      auto candidateLoopOp = llvm::dyn_cast<mlir::omp::LoopOp>(&candidateOp);

      if (candidateLoopOp && !candidateLoopOp.getReductionVars().empty())
        return candidateLoopOp;
    }

    return nullptr;
  }

  /// Hoists the reductions of a directly nested `omp.loop` op to `teamsOp`.
  ///
  /// The `omp.teams` op is updated in place: it receives the reduction
  /// modifier, variables, by-ref flags and symbols of the nested loop op, plus
  /// one new region argument per reduction variable, inserted right after the
  /// private block arguments. The reduction operands of the nested loop op
  /// are then replaced by these new `omp.teams` block arguments.
  ///
  /// Returns a match failure, leaving the IR untouched, when `teamsOp` has no
  /// directly nested `omp.loop` op with reductions; returns success
  /// otherwise.
  mlir::LogicalResult
  matchAndRewrite(mlir::omp::TeamsOp teamsOp, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    mlir::omp::LoopOp nestedLoopOp = tryToFindNestedLoopWithReduction(teamsOp);

    // The lookup returns a null op when nothing qualifies. Bail out instead
    // of dereferencing it, so the pattern does not depend on the legality
    // configuration of its user to be safe.
    if (!nestedLoopOp)
      return rewriter.notifyMatchFailure(
          teamsOp, "no directly nested `omp.loop` op with reductions");

    rewriter.modifyOpInPlace(teamsOp, [&]() {
      teamsOp.setReductionMod(nestedLoopOp.getReductionMod());
      teamsOp.getReductionVarsMutable().assign(nestedLoopOp.getReductionVars());
      teamsOp.setReductionByref(nestedLoopOp.getReductionByref());
      teamsOp.setReductionSymsAttr(nestedLoopOp.getReductionSymsAttr());

      // New reduction block arguments are inserted right after the private
      // block arguments of the `omp.teams` op.
      auto blockArgIface =
          llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*teamsOp);
      unsigned reductionArgsStart = blockArgIface.getPrivateBlockArgsStart() +
                                    blockArgIface.numPrivateBlockArgs();
      llvm::SmallVector<mlir::Value> newLoopOpReductionOperands;

      for (auto [idx, reductionVar] :
           llvm::enumerate(nestedLoopOp.getReductionVars())) {
        mlir::BlockArgument newTeamsOpReductionBlockArg =
            teamsOp.getRegion().insertArgument(reductionArgsStart + idx,
                                               reductionVar.getType(),
                                               reductionVar.getLoc());
        newLoopOpReductionOperands.push_back(newTeamsOpReductionBlockArg);
      }

      // The nested loop now reduces into the block arguments of the
      // `omp.teams` op rather than into the original variables.
      nestedLoopOp.getReductionVarsMutable().assign(newLoopOpReductionOperands);
    });

    return mlir::success();
  }
};

/// Pass that runs `ReductionsHoistingPattern` and
/// `GenericLoopConversionPattern` as a full dialect conversion over a
/// function. Function declarations are skipped.
class GenericLoopConversionPass
    : public flangomp::impl::GenericLoopConversionPassBase<
          GenericLoopConversionPass> {
public:
  GenericLoopConversionPass() = default;

  void runOnOperation() override {
    mlir::func::FuncOp func = getOperation();

    if (func.isDeclaration())
      return;

    mlir::MLIRContext *context = &getContext();

    mlir::ConversionTarget target(*context);

    // Every op without a more specific rule below is legal.
    target.markUnknownOpDynamicallyLegal(
        [](mlir::Operation *) { return true; });

    // An `omp.teams` op is legal if its reductions are already populated, or
    // if it does not nest a LoopOp with reductions.
    target.addDynamicallyLegalOp<mlir::omp::TeamsOp>(
        [](mlir::omp::TeamsOp teamsOp) {
          if (!teamsOp.getReductionVars().empty())
            return true;

          return !ReductionsHoistingPattern::tryToFindNestedLoopWithReduction(
              teamsOp);
        });

    // An `omp.loop` op is treated as legal, i.e. left alone, only when the
    // support check fails for it; the check emits the error on the op itself.
    target.addDynamicallyLegalOp<mlir::omp::LoopOp>(
        [](mlir::omp::LoopOp loopOp) {
          return mlir::failed(
              GenericLoopConversionPattern::checkLoopConversionSupportStatus(
                  loopOp));
        });

    mlir::RewritePatternSet patterns(context);
    patterns.insert<ReductionsHoistingPattern, GenericLoopConversionPattern>(
        context);

    if (mlir::succeeded(
            mlir::applyFullConversion(func, target, std::move(patterns))))
      return;

    mlir::emitError(func.getLoc(), "error in converting `omp.loop` op");
    signalPassFailure();
  }
};
} // namespace