diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 9f9dd436dc6a..e62395676a69 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -259,6 +259,7 @@ bool ClauseProcessor::processCollapse( llvm::SmallVectorImpl &iv) const { int64_t numCollapse = collectLoopRelatedInfo(converter, currentLocation, eval, + getNestedDoConstruct(eval), clauses, loopResult, iv); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); collapseResult.collapseNumLoops = firOpBuilder.getI64IntegerAttr(numCollapse); @@ -518,6 +519,21 @@ bool ClauseProcessor::processSizes(StatementContext &stmtCtx, return false; } +bool ClauseProcessor::processLooprange(StatementContext &stmtCtx, + mlir::omp::LooprangeClauseOps &result, + int64_t &count) const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + if (auto *clause = findUniqueClause()) { + int64_t first = evaluate::ToInt64(std::get<0>(clause->t)).value(); + count = evaluate::ToInt64(std::get<1>(clause->t)).value(); + result.first = firOpBuilder.getI64IntegerAttr(first); + result.count = firOpBuilder.getI64IntegerAttr(count); + return true; + } + + return false; +} + bool ClauseProcessor::processNumTeams( lower::StatementContext &stmtCtx, mlir::omp::NumTeamsClauseOps &result) const { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index ca9b28dfdd06..da920407b216 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -68,6 +68,9 @@ public: llvm::SmallVectorImpl &iv) const; bool processSizes(StatementContext &stmtCtx, mlir::omp::SizesClauseOps &result) const; + bool processLooprange(StatementContext &stmtCtx, + mlir::omp::LooprangeClauseOps &result, + int64_t &count) const; bool processDevice(lower::StatementContext &stmtCtx, mlir::omp::DeviceClauseOps &result) const; bool processDeviceType(mlir::omp::DeviceTypeClauseOps 
&result) const; diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index a958ec9ba503..fcf2ae933729 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -347,7 +347,7 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { mlir::omp::LoopRelatedClauseOps result; llvm::SmallVector iv; collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval, - clauses, result, iv); + getNestedDoConstruct(eval), clauses, result, iv); // Update the original variable just before exiting the worksharing // loop. Conversion as follows: diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index f7be823335bd..6d93f245228a 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2039,12 +2039,27 @@ genLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return loopOp; } +// `nestedEval` is the Evaluation of a child loop of `eval`. +// In a regular OpenMP Construct Evaluation `nestedEval` is the only child. +// Can be retrieved with getNestedDoConstruct(Evaluation). +// <> +// Loop +// <> +// +// `nestedEval` is most useful in the case that `eval` contains a sequence +// of loops. Then this function generates Canonical loop nests for individual +// loops. 
+// <> +// Loop 1 +// Loop 2 +// <> +// static void genCanonicalLoopNest( lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::const_iterator item, size_t numLoops, - llvm::SmallVectorImpl &loops) { + lower::pft::Evaluation *nestedEval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item, + size_t numLoops, llvm::SmallVectorImpl &loops) { assert(loops.empty() && "Expecting empty list to fill"); assert(numLoops >= 1 && "Expecting at least one loop"); @@ -2052,7 +2067,8 @@ static void genCanonicalLoopNest( mlir::omp::LoopRelatedClauseOps loopInfo; llvm::SmallVector ivs; - collectLoopRelatedInfo(converter, loc, eval, numLoops, loopInfo, ivs); + collectLoopRelatedInfo(converter, loc, eval, nestedEval, numLoops, loopInfo, + ivs); assert(ivs.size() == numLoops && "Expected to parse as many loop variables as there are loops"); @@ -2074,7 +2090,7 @@ static void genCanonicalLoopNest( // Step 1: Loop prologues // Computing the trip count must happen before entering the outermost loop - lower::pft::Evaluation *innermostEval = &eval.getFirstNestedEvaluation(); + lower::pft::Evaluation *innermostEval = nestedEval; for ([[maybe_unused]] auto iv : ivs) { if (innermostEval->getIf()->IsDoConcurrent()) { // OpenMP specifies DO CONCURRENT only with the `!omp loop` construct. 
@@ -2246,8 +2262,9 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector canonLoops; canonLoops.reserve(numLoops); - genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item, - numLoops, canonLoops); + genCanonicalLoopNest(converter, symTable, semaCtx, eval, + getNestedDoConstruct(eval), loc, queue, item, numLoops, + canonLoops); assert((canonLoops.size() == numLoops) && "Expecting the predetermined number of loops"); @@ -2277,6 +2294,50 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, sizesClause.sizes); } +static void genFuseOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + lower::StatementContext &stmtCtx, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, + ConstructQueue::const_iterator item) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + int64_t count = 0; + mlir::omp::LooprangeClauseOps looprangeClause; + ClauseProcessor cp(converter, semaCtx, item->clauses); + bool looprange = cp.processLooprange(stmtCtx, looprangeClause, count); + + llvm::SmallVector applyees; + for (auto &child : eval.getNestedEvaluations()) { + // Stop at OmpEndLoopDirective + if (&child == &eval.getLastNestedEvaluation()) + break; + // Skip any Compiler Directive + if (child.getIf()) + continue; + + // Emit the associated loop + llvm::SmallVector canonLoops; + genCanonicalLoopNest(converter, symTable, semaCtx, eval, &child, loc, queue, + item, 1, canonLoops); + + auto cli = llvm::getSingleElement(canonLoops).getCli(); + applyees.push_back(cli); + } + // One generated loop + one for each loop not inside the specified looprange + // if present + llvm::SmallVector generatees; + int64_t numGeneratees = !looprange ? 
1 : applyees.size() - count + 1; + for (int i = 0; i < numGeneratees; i++) { + auto fusedCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc); + generatees.push_back(fusedCLI); + } + + mlir::omp::FuseOp::create(firOpBuilder, loc, generatees, applyees, + looprangeClause.first, looprangeClause.count); +} + static void genUnrollOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, lower::StatementContext &stmtCtx, @@ -2293,7 +2354,8 @@ static void genUnrollOp(Fortran::lower::AbstractConverter &converter, // Emit the associated loop llvm::SmallVector canonLoops; - genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item, 1, + genCanonicalLoopNest(converter, symTable, semaCtx, eval, + getNestedDoConstruct(eval), loc, queue, item, 1, canonLoops); llvm::SmallVector applyees; @@ -3672,13 +3734,9 @@ static void genOMPDispatch(lower::AbstractConverter &converter, case llvm::omp::Directive::OMPD_tile: genTileOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; - case llvm::omp::Directive::OMPD_fuse: { - unsigned version = semaCtx.langOptions().OpenMPVersion; - if (!semaCtx.langOptions().OpenMPSimd) - TODO(loc, "Unhandled loop directive (" + - llvm::omp::getOpenMPDirectiveName(dir, version) + ")"); + case llvm::omp::Directive::OMPD_fuse: + genFuseOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; - } case llvm::omp::Directive::OMPD_unroll: genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index dce858085666..e9ba5f386803 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -836,13 +836,14 @@ void collectTileSizesFromOpenMPConstruct( int64_t collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, const omp::List &clauses, + lower::pft::Evaluation &eval, lower::pft::Evaluation 
*nestedEval, + const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) { int64_t numCollapse = 1; // Collect the loops to collapse. - lower::pft::Evaluation *doConstructEval = getNestedDoConstruct(eval); + lower::pft::Evaluation *doConstructEval = nestedEval; if (doConstructEval->getIf()->IsDoConcurrent()) { TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); } @@ -854,21 +855,21 @@ int64_t collectLoopRelatedInfo( numCollapse = collapseValue; } - collectLoopRelatedInfo(converter, currentLocation, eval, numCollapse, result, - iv); + collectLoopRelatedInfo(converter, currentLocation, eval, nestedEval, + numCollapse, result, iv); return numCollapse; } void collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, int64_t numCollapse, - mlir::omp::LoopRelatedClauseOps &result, + lower::pft::Evaluation &eval, lower::pft::Evaluation *nestedEval, + int64_t numCollapse, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // Collect the loops to collapse. 
- lower::pft::Evaluation *doConstructEval = getNestedDoConstruct(eval); + lower::pft::Evaluation *doConstructEval = nestedEval; if (doConstructEval->getIf()->IsDoConcurrent()) { TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); } diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 8a68ff8bd3bd..f70755719784 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -171,13 +171,15 @@ pft::Evaluation *getNestedDoConstruct(pft::Evaluation &eval); int64_t collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, const omp::List &clauses, + lower::pft::Evaluation &eval, lower::pft::Evaluation *nestedEval, + const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); void collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, std::int64_t collapseValue, + lower::pft::Evaluation &eval, lower::pft::Evaluation *nestedEval, + std::int64_t collapseValue, // const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); diff --git a/flang/test/Lower/OpenMP/fuse01.f90 b/flang/test/Lower/OpenMP/fuse01.f90 new file mode 100644 index 000000000000..1377bf3e9c52 --- /dev/null +++ b/flang/test/Lower/OpenMP/fuse01.f90 @@ -0,0 +1,93 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s + + +subroutine omp_fuse01(lb1, ub1, inc1, lb2, ub2, inc2) + integer res, i, j + integer lb1, ub1, inc1 + integer lb2, ub2, inc2 + + !$omp fuse + do i = lb1, ub1, inc1 + res = i + end do + do j = lb2, ub2, inc2 + res = j + end do + !$omp end fuse + +end subroutine omp_fuse01 + + +! CHECK-LABEL: func.func @_QPomp_fuse01( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "lb1"}, +! CHECK-SAME: %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "ub1"}, +! 
CHECK-SAME: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "inc1"}, +! CHECK-SAME: %[[ARG3:.*]]: !fir.ref {fir.bindc_name = "lb2"}, +! CHECK-SAME: %[[ARG4:.*]]: !fir.ref {fir.bindc_name = "ub2"}, +! CHECK-SAME: %[[ARG5:.*]]: !fir.ref {fir.bindc_name = "inc2"}) { +! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse01Ei"} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse01Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse01Ej"} +! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse01Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse01Eres"} +! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse01Eres"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_7]]#0 : !fir.ref +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32 +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32 +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow : i32 +! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32 +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow : i32 +! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32 +! CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32 +! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) { +! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32 +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_6]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref +! 
CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32 +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32 +! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32 +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow : i32 +! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32 +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow : i32 +! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32 +! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32 +! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) { +! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32 +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32 +! CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_6]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli +! CHECK: omp.fuse (%[[NEW_CLI_2]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]]) +! CHECK: return +! CHECK: } + diff --git a/flang/test/Lower/OpenMP/fuse02.f90 b/flang/test/Lower/OpenMP/fuse02.f90 new file mode 100644 index 000000000000..5a5900a4dafc --- /dev/null +++ b/flang/test/Lower/OpenMP/fuse02.f90 @@ -0,0 +1,123 @@ +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s + + +subroutine omp_fuse02(lb1, ub1, inc1, lb2, ub2, inc2) + integer res, i, j, k + integer lb1, ub1, inc1 + integer lb2, ub2, inc2 + + !$omp fuse looprange(2,2) + do i = lb1, ub1, inc1 + res = i + end do + do j = lb2, ub2, inc2 + res = j + end do + do k = lb1, ub2, inc1 + res = k + end do + !$omp end fuse + +end subroutine omp_fuse02 + + +! CHECK-LABEL: func.func @_QPomp_fuse02( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "lb1"}, +! CHECK-SAME: %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "ub1"}, +! CHECK-SAME: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "inc1"}, +! CHECK-SAME: %[[ARG3:.*]]: !fir.ref {fir.bindc_name = "lb2"}, +! CHECK-SAME: %[[ARG4:.*]]: !fir.ref {fir.bindc_name = "ub2"}, +! CHECK-SAME: %[[ARG5:.*]]: !fir.ref {fir.bindc_name = "inc2"}) { +! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse02Ei"} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse02Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse02Ej"} +! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse02Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFomp_fuse02Ek"} +! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse02Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_3:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse02Eres"} +! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ALLOCA_3]] {uniq_name = "_QFomp_fuse02Eres"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_9:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32 +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32 +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow : i32 +! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32 +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow : i32 +! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32 +! 
CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32 +! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) { +! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32 +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_6]]#0 : !fir.ref +! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref +! CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32 +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32 +! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32 +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow : i32 +! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32 +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow : i32 +! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32 +! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32 +! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) { +! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32 +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32 +! 
CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_8:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_9:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref +! CHECK: %[[LOAD_10:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_4:.*]] = arith.cmpi slt, %[[LOAD_10]], %[[CONSTANT_4]] : i32 +! CHECK: %[[SUBI_4:.*]] = arith.subi %[[CONSTANT_4]], %[[LOAD_10]] : i32 +! CHECK: %[[SELECT_8:.*]] = arith.select %[[CMPI_4]], %[[SUBI_4]], %[[LOAD_10]] : i32 +! CHECK: %[[SELECT_9:.*]] = arith.select %[[CMPI_4]], %[[LOAD_9]], %[[LOAD_8]] : i32 +! CHECK: %[[SELECT_10:.*]] = arith.select %[[CMPI_4]], %[[LOAD_8]], %[[LOAD_9]] : i32 +! CHECK: %[[SUBI_5:.*]] = arith.subi %[[SELECT_10]], %[[SELECT_9]] overflow : i32 +! CHECK: %[[DIVUI_2:.*]] = arith.divui %[[SUBI_5]], %[[SELECT_8]] : i32 +! CHECK: %[[ADDI_4:.*]] = arith.addi %[[DIVUI_2]], %[[CONSTANT_5]] overflow : i32 +! CHECK: %[[CMPI_5:.*]] = arith.cmpi slt, %[[SELECT_10]], %[[SELECT_9]] : i32 +! CHECK: %[[SELECT_11:.*]] = arith.select %[[CMPI_5]], %[[CONSTANT_4]], %[[ADDI_4]] : i32 +! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_2]]) %[[VAL_2:.*]] : i32 in range(%[[SELECT_11]]) { +! CHECK: %[[MULI_2:.*]] = arith.muli %[[VAL_2]], %[[LOAD_10]] : i32 +! CHECK: %[[ADDI_5:.*]] = arith.addi %[[LOAD_8]], %[[MULI_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_5]] to %[[DECLARE_4]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_11:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_11]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[NEW_CLI_3:.*]] = omp.new_cli +! CHECK: %[[NEW_CLI_4:.*]] = omp.new_cli +! 
CHECK: omp.fuse (%[[NEW_CLI_3]], %[[NEW_CLI_4]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]], %[[NEW_CLI_2]]) looprange(first = 2, count = 2) +! CHECK: return +! CHECK: } + diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f73e10c97e64..9885ffc8b206 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1370,6 +1370,59 @@ public: tileLoops(DebugLoc DL, ArrayRef Loops, ArrayRef TileSizes); + /// Fuse a sequence of loops. + /// + /// Fuses the loops of \p Loops. + /// The merging of the loops is done in the following structure: + /// + /// Example: + /// \code + /// for (int i = lb0; i < ub0; i += st0) // trip count is calculated as: + /// body(i) // tc0 = (ub0 - lb0 + st0) / st0 + /// for (int j = lb1; j < ub1; j += st1) + /// body(j); + /// + /// ... + /// + /// for (int k = lbk; k < ubk; k += stk) + /// body(k); + /// \endcode + /// + /// After fusing the loops a single loop is left: + /// \code + /// for (fuse.index = 0; fuse.index < max(tc0, tc1, ... tck); ++fuse.index) { + /// if (fuse.index < tc0){ + /// iv0 = lb0 + st0 * fuse.index; + /// original.index0 = iv0 + /// body(0); + /// } + /// if (fuse.index < tc1){ + /// iv1 = lb1 + st1 * fuse.index; + /// original.index1 = iv1 + /// body(1); + /// } + /// + /// ... + /// + /// if (fuse.index < tck){ + /// ivk = lbk + stk * fuse.index; + /// original.indexk = ivk + /// body(k); + /// } + /// } + /// \endcode + /// + /// + /// @param DL Debug location for instructions added by fusion. + /// + /// @param Loops Loops to fuse. The CanonicalLoopInfo objects are + /// invalidated by this method, i.e. should not be used after + /// fusion. + /// + /// \returns A single loop generated by the loop fusion + LLVM_ABI CanonicalLoopInfo *fuseLoops(DebugLoc DL, + ArrayRef Loops); + /// Fully unroll a loop. 
/// /// Instead of unrolling the loop immediately (and duplicating its body diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 1947323ef85f..25f4da7c90d9 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6636,6 +6636,116 @@ static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, } } +CanonicalLoopInfo * +OpenMPIRBuilder::fuseLoops(DebugLoc DL, ArrayRef Loops) { + CanonicalLoopInfo *firstLoop = Loops.front(); + CanonicalLoopInfo *lastLoop = Loops.back(); + Function *F = firstLoop->getPreheader()->getParent(); + + // Loop control blocks that will become orphaned later + SmallVector oldControlBBs; + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(oldControlBBs); + + // Collect original trip counts + SmallVector origTripCounts; + for (CanonicalLoopInfo *L : Loops) { + assert(L->isValid() && "All input loops must be valid canonical loops"); + origTripCounts.push_back(L->getTripCount()); + } + + Builder.SetCurrentDebugLocation(DL); + + // Compute max trip count. 
+ // The fused loop will be from 0 to max(origTripCounts) + BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc", + F, firstLoop->getHeader()); + Builder.SetInsertPoint(TCBlock); + Value *fusedTripCount = nullptr; + for (CanonicalLoopInfo *L : Loops) { + assert(L->isValid() && "All loops to fuse must be valid canonical loops"); + Value *origTripCount = L->getTripCount(); + if (!fusedTripCount) { + fusedTripCount = origTripCount; + continue; + } + Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount); + fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount, + ".omp.fuse.tc"); + } + + // Generate new loop + CanonicalLoopInfo *fused = + createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(), + lastLoop->getLatch(), "fused"); + + // Replace original loops with the fused loop + // Preheader and After are not considered inside the CLI. + // These are used to compute the individual TCs of the loops + // so they have to be put before the resulting fused loop. + // Moving them up for readability. 
+ for (size_t i = 0; i < Loops.size() - 1; ++i) { + Loops[i]->getPreheader()->moveBefore(TCBlock); + Loops[i]->getAfter()->moveBefore(TCBlock); + } + lastLoop->getPreheader()->moveBefore(TCBlock); + + for (size_t i = 0; i < Loops.size() - 1; ++i) { + redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL); + redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL); + } + redirectTo(lastLoop->getPreheader(), TCBlock, DL); + redirectTo(TCBlock, fused->getPreheader(), DL); + redirectTo(fused->getAfter(), lastLoop->getAfter(), DL); + + // Build the fused body + // Create new Blocks with conditions that jump to the original loop bodies + SmallVector condBBs; + SmallVector condValues; + for (size_t i = 0; i < Loops.size(); ++i) { + BasicBlock *condBlock = BasicBlock::Create( + F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody()); + Builder.SetInsertPoint(condBlock); + Value *condValue = + Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]); + condBBs.push_back(condBlock); + condValues.push_back(condValue); + } + // Join the condition blocks with the bodies of the original loops + redirectTo(fused->getBody(), condBBs[0], DL); + for (size_t i = 0; i < Loops.size() - 1; ++i) { + Builder.SetInsertPoint(condBBs[i]); + Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]); + redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL); + // Replace the IV with the fused IV + Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar()); + } + // Last body jumps to the created end body block + Builder.SetInsertPoint(condBBs.back()); + Builder.CreateCondBr(condValues.back(), lastLoop->getBody(), + fused->getLatch()); + redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL); + // Replace the IV with the fused IV + lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar()); + + // The loop latch must have only one predecessor. 
Currently it is branched to + // from both the last condition block and the last loop body + fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(), + "omp.fused.pre_latch"); + + // Remove unused parts + removeUnusedBlocksFromParent(oldControlBBs); + + // Invalidate old CLIs + for (CanonicalLoopInfo *L : Loops) + L->invalidate(); + +#ifndef NDEBUG + fused->assertOK(); +#endif + return fused; +} + void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) { LLVMContext &Ctx = Builder.getContext(); addLoopMetadata( diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 286b07941a03..ba52e52ebf58 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -1120,6 +1120,32 @@ class OpenMP_SizesClauseSkip< def OpenMP_SizesClause : OpenMP_SizesClauseSkip<>; +//===----------------------------------------------------------------------===// +// V6.0 `looprange` clause +//===----------------------------------------------------------------------===// + +class OpenMP_LooprangeClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false> + : OpenMP_Clause { + let arguments = (ins OptionalAttr:$first, + OptionalAttr:$count); + + let optAssemblyFormat = [{ + `looprange` `(` `first` `=` $first `,` `count` `=` $count `)` + }]; + + let description = [{ + The `looprange` clause contains a range that represents the loops affected + by a loop fusion. The `first` attribute is the first loop of the sequence + that will be affected and the `count` attribute is the number of loops that + are affected by the loop fusion. 
+ }]; +} + +def OpenMP_LooprangeClause : OpenMP_LooprangeClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [10.1.2] `num_threads` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 49a724fd5446..0f51b08f87dc 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -550,6 +550,33 @@ def TileOp : OpenMPTransformBase_Op<"tile", let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// OpenMP fuse operation +//===----------------------------------------------------------------------===// + +def FuseOp + : OpenMPTransformBase_Op<"fuse", clauses = [OpenMP_LooprangeClause]> { + let summary = "OpenMP fuse operation"; + let description = [{ + Represents the OpenMP fuse directive introduced in OpenMP 6.0. + + The construct takes a loop sequence and merges the loops specified by the + `looprange` clause and generates a loop sequence with the loops before the + `first` attribute untouched, the generated fused loop, and the loops after + the `first` + `count` attributes untouched, maintaining the original + order. If the `looprange` clause is not present, all the loops in the + sequence are fused generating a single loop. + Each logical iteration of the fused loop executes a logical iteration of + each affected loop. The fused loop has the number of logical iterations + equal to the affected loop with the most logical iterations. + + The `first` and `count` attributes of the `looprange` clause are constant + and known beforehand if present.
+  }]#clausesDescription;
+
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // 2.8.3 Workshare Construct
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index e8eebc2a1a4c..601c970bc8a6 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -3455,6 +3455,18 @@ void NewCliOp::getAsmResultNames(OpAsmSetValueNameFn setNameFn) {
           .Case([&](UnrollHeuristicOp op) -> std::string {
             llvm_unreachable("heuristic unrolling does not generate a loop");
           })
+          .Case([&](FuseOp op) -> std::string {
+            unsigned opnum = generator->getOperandNumber();
+            // The position of the first loop to be fused is the same position
+            // as the resulting fused loop
+            // NOTE(review): the LLVM IR translation indexes with `first - 1`
+            // (1-based `first`), while `opnum` is presumably a 0-based operand
+            // number; this comparison looks off by one -- confirm.
+            if (op.getFirst().has_value() && opnum != op.getFirst().value())
+              return "canonloop_fuse";
+            else
+              return "fused";
+          })
           .Case([&](TileOp op) -> std::string {
             auto [generateesFirst, generateesCount] =
                 op.getGenerateesODSOperandIndexAndLength();
@@ -3830,6 +3842,83 @@ std::pair TileOp::getGenerateesODSOperandIndexAndLength() {
   return getODSOperandIndexAndLength(odsIndex_generatees);
 }
 
+//===----------------------------------------------------------------------===//
+// FuseOp
+//===----------------------------------------------------------------------===//
+
+/// Print the CLI operands of a loop transformation: the generatees in
+/// parentheses (omitted when empty), then ` <- (` applyees `)` (omitted when
+/// empty).
+static void printLoopTransformClis(OpAsmPrinter &p, FuseOp op,
+                                   OperandRange generatees,
+                                   OperandRange applyees) {
+  if (!generatees.empty())
+    p << '(' << llvm::interleaved(generatees) << ')';
+
+  if (!applyees.empty())
+    p << " <- (" << llvm::interleaved(applyees) << ')';
+}
+
+/// Check the structural invariants of `omp.fuse`: at least two applyees, a
+/// well-formed `looprange` (if any) that fits the applyee list, a generatee
+/// count matching the resulting loop sequence, and applyees that are all
+/// produced by `omp.canonical_loop`.
+LogicalResult FuseOp::verify() {
+  const size_t numApplyees = getApplyees().size();
+  const size_t numGeneratees = getGeneratees().size();
+
+  if (numApplyees < 2)
+    return emitOpError() << "must apply to at least two loops";
+
+  // `first` and `count` are separate optional attributes, but the checks
+  // below (and the LLVM IR translation) need both or neither.
+  if (getFirst().has_value() != getCount().has_value())
+    return emitOpError()
+           << "looprange requires both first and count attributes";
+
+  if (getFirst().has_value()) {
+    const uint64_t first = getFirst().value();
+    const uint64_t count = getCount().value();
+    // `first` is a 1-based position, so zero is never valid; a zero `count`
+    // would leave nothing to fuse.
+    if (first == 0 || count == 0)
+      return emitOpError()
+             << "looprange first and count attributes must be positive";
+    // Phrased without `first + count` so that huge (or negative, when
+    // reinterpreted as unsigned) attribute values cannot wrap around.
+    if (first > numApplyees || count > numApplyees - first + 1)
+      return emitOpError() << "the numbers of applyees must be at least first "
+                              "minus one plus count attributes";
+    // The `count` fused loops collapse into a single generatee.
+    if (numGeneratees != 0 && numGeneratees != numApplyees + 1 - count)
+      return emitOpError() << "the number of generatees must be the number of "
+                              "aplyees plus one minus count";
+  } else if (numGeneratees > 1) {
+    return emitOpError()
+           << "in a complete fuse the number of generatees must be exactly 1";
+  }
+
+  for (auto &&applyee : getApplyees()) {
+    auto [create, gen, cons] = decodeCli(applyee);
+
+    if (!gen)
+      return emitOpError() << "applyee CLI has no generator";
+    auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner());
+    if (!loop)
+      return emitOpError()
+             << "currently only supports omp.canonical_loop as applyee";
+  }
+  return success();
+}
+
+std::pair<unsigned, unsigned> FuseOp::getApplyeesODSOperandIndexAndLength() {
+  return getODSOperandIndexAndLength(odsIndex_applyees);
+}
+
+std::pair<unsigned, unsigned> FuseOp::getGenerateesODSOperandIndexAndLength() {
+  return getODSOperandIndexAndLength(odsIndex_generatees);
+}
+
 //===----------------------------------------------------------------------===//
 // Critical construct (2.17.1)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index bbde9f3b9071..38c5802ed60e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3794,6 +3794,51 @@ static LogicalResult applyTile(omp::TileOp op, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Apply a `#pragma omp fuse` / `!$omp fuse` transformation using the
+/// OpenMPIRBuilder.
+///
+/// Applyees outside the `looprange` window are not transformed; their
+/// CanonicalLoopInfos are forwarded to the matching generatees, with the fused
+/// loop taking the position of the first fused applyee.
+static LogicalResult applyFuse(omp::FuseOp op, llvm::IRBuilderBase &builder,
+                               LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  llvm::OpenMPIRBuilder::LocationDescription loc(builder);
+
+  // `looprange(first, count)` selects a 1-based window of applyees to fuse;
+  // without it the whole sequence is fused. Work with the equivalent
+  // half-open, 0-based index range [fuseBegin, fuseEnd). A missing `first`
+  // defaults to the start of the sequence instead of being dereferenced.
+  const size_t numApplyees = op.getApplyees().size();
+  const size_t fuseBegin =
+      op.getFirst().has_value() ? op.getFirst().value() - 1 : 0;
+  const size_t fuseEnd = op.getCount().has_value()
+                             ? fuseBegin + op.getCount().value()
+                             : numApplyees;
+  assert(fuseBegin < fuseEnd && fuseEnd <= numApplyees &&
+         "looprange must select a non-empty window of the applyees");
+
+  // Select what CLIs are going to be fused
+  SmallVector<llvm::CanonicalLoopInfo *> beforeFuse, toFuse, afterFuse;
+  for (size_t i = 0; i < numApplyees; ++i) {
+    llvm::CanonicalLoopInfo *consBuilderCLI =
+        moduleTranslation.lookupOMPLoop(op.getApplyees()[i]);
+    // Check the lookup result, not the (always non-null) operand value.
+    assert(consBuilderCLI && "Canonical loop must already been translated");
+    if (i < fuseBegin)
+      beforeFuse.push_back(consBuilderCLI);
+    else if (i >= fuseEnd)
+      afterFuse.push_back(consBuilderCLI);
+    else
+      toFuse.push_back(consBuilderCLI);
+  }
+  assert(
+      (op.getGeneratees().empty() ||
+       beforeFuse.size() + afterFuse.size() + 1 == op.getGeneratees().size()) &&
+      "Wrong number of generatees");
+
+  // do the fuse
+  llvm::CanonicalLoopInfo *generatedLoop = ompBuilder->fuseLoops(loc.DL, toFuse);
+  if (!op.getGeneratees().empty()) {
+    // Generatees are laid out as: untouched loops before the window, the
+    // fused loop, untouched loops after the window. A single running index
+    // walks the generatees while each source list is indexed from its own
+    // start.
+    size_t genIdx = 0;
+    for (llvm::CanonicalLoopInfo *cli : beforeFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], cli);
+    moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], generatedLoop);
+    for (llvm::CanonicalLoopInfo *cli : afterFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], cli);
+  }
+
+  // CLIs can only be consumed once
+  for (Value applyee : op.getApplyees())
+    moduleTranslation.invalidateOmpLoop(applyee);
+
+  return success();
+}
+
 /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering.
static llvm::AtomicOrdering convertAtomicOrdering(std::optional ao) { @@ -7271,6 +7316,9 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TileOp op) { return applyTile(op, builder, moduleTranslation); }) + .Case([&](omp::FuseOp op) { + return applyFuse(op, builder, moduleTranslation); + }) .Case([&](omp::TargetAllocMemOp) { return convertTargetAllocMemOp(*op, builder, moduleTranslation); }) diff --git a/mlir/test/Dialect/OpenMP/cli-fuse.mlir b/mlir/test/Dialect/OpenMP/cli-fuse.mlir new file mode 100644 index 000000000000..df96810aadb4 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/cli-fuse.mlir @@ -0,0 +1,114 @@ +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope + + +// Raw syntax check (MLIR output is always pretty-printed) +// CHECK-LABEL: @omp_fuse_raw( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_raw(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %canonloop_s1 = omp.new_cli + %canonloop_s1 = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %fused = omp.new_cli + %fused = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + "omp.canonical_loop" (%tc1, %canonloop_s0) ({ + ^bb0(%iv_s0: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + "omp.canonical_loop" (%tc2, %canonloop_s1) ({ + ^bb0(%iv_s1: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1) + "omp.fuse"(%fused, %canonloop_s0, %canonloop_s1) <{operandSegmentSizes = array}> : (!omp.cli, !omp.cli, !omp.cli) -> () + return +} + +// Pretty syntax check +// CHECK-LABEL: @omp_fuse_pretty( +// CHECK-SAME: 
%[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_pretty(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %fused = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1) + omp.fuse(%fused) <- (%canonloop_s0, %canonloop_s1) + return +} + +// Specifying the generatees for omp.fuse is optional +// CHECK-LABEL: @omp_fuse_optionalgen_pretty( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_optionalgen_pretty(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: %canonloop_s1 = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse <- (%canonloop_s0, %canonloop_s1) + omp.fuse <- (%canonloop_s0, %canonloop_s1) + return +} + +// Fuse with looprange attributes +// CHECK-LABEL: @omp_fuse_looprange( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[tc3:.+]]: i32) { +func.func @omp_fuse_looprange(%tc1 : i32, %tc2 : i32, %tc3 : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = 
omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s2 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_fuse = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %fused = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%[[tc3]]) { + omp.canonical_loop (%canonloop_s2) %iv_s2 : i32 in range(%tc3) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse (%canonloop_fuse, %fused) <- (%canonloop_s0, + // %canonloop_s1, %canonloop_s2) looprange(first = 1, count = 2) + omp.fuse(%fused, %canonloop_fuse) <- (%canonloop_s0, %canonloop_s1, %canonloop_s2) looprange(first = 1, count = 2) + return +} + diff --git a/mlir/test/Dialect/OpenMP/invalid-fuse.mlir b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir new file mode 100644 index 000000000000..ffd1c7300e79 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir @@ -0,0 +1,102 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + + +func.func @no_loops(%tc1 : i32, %tc2 : i32) { + // expected-error@+1 {{'omp.fuse' op must apply to at least two loops}} + omp.fuse <-() + + return +} + +// ----- + +func.func @one_loop(%tc1 : i32, %tc2 : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc1) { + omp.terminator + } + // expected-error@+1 {{'omp.fuse' op must apply to at least two loops}} + omp.fuse <-(%canonloop) + + return +} + +// ----- + +func.func @missing_generator(%tc1 : i32, 
%tc2 : i32) { + // expected-error@+1 {{'omp.new_cli' op CLI has no generator}} + %canonloop = omp.new_cli + + // expected-note@+1 {{see consumer here: "omp.fuse"(%0) <{operandSegmentSizes = array}> : (!omp.cli) -> ()}} + omp.fuse <-(%canonloop) + + return +} + +// ----- + +func.func @wrong_generatees1(%tc1 : i32, %tc2 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + + %fused1 = omp.new_cli + %fused2 = omp.new_cli + // expected-error@+1 {{'omp.fuse' op in a complete fuse the number of generatees must be exactly 1}} + omp.fuse (%fused1, %fused2) <-(%canonloop1, %canonloop2) + + llvm.return +} + +// ----- + +func.func @wrong_generatees2(%tc1 : i32, %tc2 : i32, %tc3 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) { + omp.terminator + } + + %fused = omp.new_cli + // expected-error@+1 {{'omp.fuse' op the number of generatees must be the number of aplyees plus one minus count}} + omp.fuse (%fused) <-(%canonloop1, %canonloop2, %canonloop3) looprange(first = 1, count = 2) + + llvm.return +} + +// ----- + +func.func @wrong_applyees(%tc1 : i32, %tc2 : i32, %tc3 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) { + omp.terminator + } + + %fused = omp.new_cli + %canonloop_fuse = omp.new_cli + // expected-error@+1 {{'omp.fuse' op the numbers of applyees must be at least first 
minus one plus count attributes}} + omp.fuse (%fused, %canonloop_fuse) <-(%canonloop1, %canonloop2, %canonloop3) looprange(first = 1, count = 5) + + llvm.return +} + diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir new file mode 100644 index 000000000000..0754572b2477 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir @@ -0,0 +1,100 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @fuse_trivial_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32) -> () { + %literal_cli1 = omp.new_cli + omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli2 = omp.new_cli + omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(21.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.fuse <- (%literal_cli1, %literal_cli2) + llvm.return +} + +// CHECK-LABEL: define void @fuse_trivial_loops( +// CHECK-SAME: ptr %[[VAL_11:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]: +// CHECK-NEXT: %[[VAL_15:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_16:.+]] +// CHECK-NEXT: %[[VAL_17:.+]] = select i1 %[[VAL_15:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]] +// CHECK-NEXT: br label 
%[[OMP_FUSED_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_HEADER]]: +// CHECK-NEXT: %[[VAL_4:.+]] = phi i32 [ 0, %[[VAL_18:.+]] ], [ %[[VAL_27:.+]], %[[VAL_26:.+]] ] +// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_COND]]: +// CHECK-NEXT: %[[VAL_29:.+]] = icmp ult i32 %[[VAL_4:.+]], %[[VAL_17:.+]] +// CHECK-NEXT: br i1 %[[VAL_29:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_BODY]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]: +// CHECK-NEXT: %[[VAL_3:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_5:.+]] +// CHECK-NEXT: br i1 %[[VAL_3:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: %[[VAL_10:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_10:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND13]]: +// CHECK-NEXT: %[[VAL_19:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_16:.+]] +// CHECK-NEXT: br i1 %[[VAL_19:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[VAL_23:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]] +// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_23:.+]], align 4 +// CHECK-NEXT: br 
label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INC]]: +// CHECK-NEXT: %[[VAL_27:.+]] = add nuw i32 %[[VAL_4:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]: +// CHECK-NEXT: ret void + diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir new file mode 100644 index 000000000000..298cf0c50035 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir @@ -0,0 +1,140 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @fuse_looprange_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %tc3: i32) -> () { + %literal_cli1 = omp.new_cli + omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli2 = omp.new_cli + omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(21.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli3 = omp.new_cli + omp.canonical_loop(%literal_cli3) %iv3 : i32 in range(%tc3) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv3] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(63.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + 
} + omp.fuse <- (%literal_cli1, %literal_cli2, %literal_cli3) looprange(first = 1, count = 2) + llvm.return +} + + +// CHECK-LABEL: define void @fuse_looprange_loops( +// CHECK-SAME: ptr %[[VAL_23:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]], i32 %[[VAL_40:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]: +// CHECK-NEXT: %[[VAL_4:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_6:.+]] +// CHECK-NEXT: %[[VAL_7:.+]] = select i1 %[[VAL_4:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]] +// CHECK-NEXT: br label %[[OMP_FUSED_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_HEADER]]: +// CHECK-NEXT: %[[VAL_11:.+]] = phi i32 [ 0, %[[VAL_8:.+]] ], [ %[[VAL_12:.+]], %[[VAL_10:.+]] ] +// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_COND]]: +// CHECK-NEXT: %[[VAL_14:.+]] = icmp ult i32 %[[VAL_11:.+]], %[[VAL_7:.+]] +// CHECK-NEXT: br i1 %[[VAL_14:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_BODY]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]: +// CHECK-NEXT: %[[VAL_18:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_5:.+]] +// CHECK-NEXT: br i1 %[[VAL_18:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND25:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: 
%[[VAL_22:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_22:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND25:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND25]]: +// CHECK-NEXT: %[[VAL_25:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_6:.+]] +// CHECK-NEXT: br i1 %[[VAL_25:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[VAL_29:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]] +// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_29:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INC]]: +// CHECK-NEXT: %[[VAL_12:.+]] = add nuw i32 %[[VAL_11:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER13]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER14]]: +// CHECK-NEXT: %[[VAL_36:.+]] = phi i32 [ 0, %[[VAL_33:.+]] ], [ %[[VAL_37:.+]], %[[VAL_35:.+]] ] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND15:.+]] +// CHECK-EMPTY: +// 
CHECK-NEXT: [[OMP_OMP_LOOP_COND15]]: +// CHECK-NEXT: %[[VAL_39:.+]] = icmp ult i32 %[[VAL_36:.+]], %[[VAL_40:.+]] +// CHECK-NEXT: br i1 %[[VAL_39:.+]], label %[[OMP_OMP_LOOP_BODY16:.+]], label %[[OMP_OMP_LOOP_EXIT18:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY16]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION24:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION24]]: +// CHECK-NEXT: %[[VAL_44:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_36:.+]] +// CHECK-NEXT: store float 6.300000e+01, ptr %[[VAL_44:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT23:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT23]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC17:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_INC17]]: +// CHECK-NEXT: %[[VAL_37:.+]] = add nuw i32 %[[VAL_36:.+]], 1 +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT18]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER19:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER19]]: +// CHECK-NEXT: ret void + diff --git a/openmp/runtime/test/transform/fuse/do-looprange.f90 b/openmp/runtime/test/transform/fuse/do-looprange.f90 new file mode 100644 index 000000000000..8c62b24c4744 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/do-looprange.f90 @@ -0,0 +1,60 @@ +! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe +! RUN: %t.exe | FileCheck %s --match-full-lines + +program fuse_full + implicit none + integer i, j, k, u + + print *, 'do' + + !$OMP FUSE LOOPRANGE(2,2) + do i=5, 25, 5 + print '("i=", I0)', i + end do + do j=10, 100, 10 + print '("j=", I0)', j + end do + do k=10, 0, -1 + print '("k=", I0)', k + end do + do u=5, 25, 5 + print '("u=", I0)', u + end do + !$OMP END FUSE + + print *, 'done' +end program + +! CHECK: do +! CHECK-NEXT: i=5 +! CHECK-NEXT: i=10 +! CHECK-NEXT: i=15 +! CHECK-NEXT: i=20 +! CHECK-NEXT: i=25 +! CHECK-NEXT: j=10 +! CHECK-NEXT: k=10 +! 
CHECK-NEXT: j=20 +! CHECK-NEXT: k=9 +! CHECK-NEXT: j=30 +! CHECK-NEXT: k=8 +! CHECK-NEXT: j=40 +! CHECK-NEXT: k=7 +! CHECK-NEXT: j=50 +! CHECK-NEXT: k=6 +! CHECK-NEXT: j=60 +! CHECK-NEXT: k=5 +! CHECK-NEXT: j=70 +! CHECK-NEXT: k=4 +! CHECK-NEXT: j=80 +! CHECK-NEXT: k=3 +! CHECK-NEXT: j=90 +! CHECK-NEXT: k=2 +! CHECK-NEXT: j=100 +! CHECK-NEXT: k=1 +! CHECK-NEXT: k=0 +! CHECK-NEXT: u=5 +! CHECK-NEXT: u=10 +! CHECK-NEXT: u=15 +! CHECK-NEXT: u=20 +! CHECK-NEXT: u=25 +! CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/fuse/do.f90 b/openmp/runtime/test/transform/fuse/do.f90 new file mode 100644 index 000000000000..d4496bce4d72 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/do.f90 @@ -0,0 +1,52 @@ +! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe +! RUN: %t.exe | FileCheck %s --match-full-lines + +program fuse_full + implicit none + integer i, j, k + + print *, 'do' + + !$OMP FUSE + do i=5, 25, 5 + print '("i=", I0)', i + end do + do j=10, 100, 10 + print '("j=", I0)', j + end do + do k=10, 0, -1 + print '("k=", I0)', k + end do + !$OMP END FUSE + + print *, 'done' +end program + +! CHECK: do +! CHECK-NEXT: i=5 +! CHECK-NEXT: j=10 +! CHECK-NEXT: k=10 +! CHECK-NEXT: i=10 +! CHECK-NEXT: j=20 +! CHECK-NEXT: k=9 +! CHECK-NEXT: i=15 +! CHECK-NEXT: j=30 +! CHECK-NEXT: k=8 +! CHECK-NEXT: i=20 +! CHECK-NEXT: j=40 +! CHECK-NEXT: k=7 +! CHECK-NEXT: i=25 +! CHECK-NEXT: j=50 +! CHECK-NEXT: k=6 +! CHECK-NEXT: j=60 +! CHECK-NEXT: k=5 +! CHECK-NEXT: j=70 +! CHECK-NEXT: k=4 +! CHECK-NEXT: j=80 +! CHECK-NEXT: k=3 +! CHECK-NEXT: j=90 +! CHECK-NEXT: k=2 +! CHECK-NEXT: j=100 +! CHECK-NEXT: k=1 +! CHECK-NEXT: k=0 +! CHECK-NEXT: done