[mlir][acc] Add ACCRecipeMaterialization pass and reduction ops (#184252)

Pass
----
Add the `acc-recipe-materialization` pass, which materializes OpenACC
privatization, firstprivate and reduction recipes by inlining their
init, copy, combiner, and destroy regions into the operation for the
construct. The pass runs on acc.parallel, acc.serial, acc.kernels, and
acc.loop.

- Firstprivate: Inserts acc.firstprivate_map so the initial value is
available on the device, then clones the recipe init and copy regions
into the construct and replaces uses with the materialized alloca.
Optional destroy region is cloned before the region terminator.

- Private: Clones the recipe init region into the construct (at region
entry or at the loop op for acc.loop private). Replaces uses of the
recipe result with the materialized alloca. Optional destroy region is
cloned before the region terminator.

- Reduction: Creates acc.reduction_init (init region inlined) and
acc.reduction_combine_region (combiner region inlined). All uses of the
reduction in the region are updated to the reduction init result.

New operations
--------------
- acc.reduction_init: Allocates and initializes a private reduction
variable from a recipe. Takes the original reduction variable and
reduction_operator; has a single region that must yield one value (the
private storage) via acc.yield. Used by the pass to materialize
acc.reduction_recipe init regions inside the compute construct.

- acc.reduction_combine_region: Combines the private reduction value
with the shared reduction variable. Takes the shared and private
memrefs; has a single region (the recipe combiner) terminated by
acc.yield with no operands. Used by the pass to materialize the
reduction recipe combiner.

Both ops implement RegionBranchOpInterface. acc.yield is updated to
allow terminating ReductionInitOp and ReductionCombineRegionOp regions.

Supporting changes
------------------
- OpenACCUtilsLoop: Factor cloneACCRegionInto out of the existing
loop-conversion helper so the pass can clone recipe regions with
optional result replacement; loop conversion now calls the shared
helper.
- Flang: Add ReductionInitOpFortranObjectViewModel
(FortranObjectViewOpInterface) for acc.reduction_init and register it in
OpenACC extensions.

Tests
-----
- MLIR: acc-recipe-materialization-{firstprivate,private,reduction,
kernel-private,parallel}.mlir (memref dialect).
- Flang: acc-recipe-materialization-{firstprivate,firstprivate-derived,
private,reduction,kernel-private,parallel}.fir; firstprivate test has a
second RUN with -acc-optimize-firstprivate-map.

---------

Co-authored-by: Scott Manley <rscottmanley@gmail.com>
This commit is contained in:
Razvan Lupusoru 2026-03-02 17:35:22 -08:00 committed by GitHub
parent 92aa2d36f0
commit e63e55cae8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 1329 additions and 41 deletions

View File

@ -14,6 +14,7 @@
#define FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_
#include "flang/Optimizer/Dialect/FIROperationMoveOpInterface.h"
#include "flang/Optimizer/Dialect/FortranVariableInterface.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
namespace fir {
@ -121,6 +122,15 @@ struct OperationMoveModel : public fir::OperationMoveOpInterface::ExternalModel<
bool canMoveOutOf(mlir::Operation *op, mlir::Operation *candidate) const;
};
struct ReductionInitOpFortranObjectViewModel
: public fir::FortranObjectViewOpInterface::ExternalModel<
ReductionInitOpFortranObjectViewModel, mlir::acc::ReductionInitOp> {
mlir::Value getViewSource(mlir::Operation *op,
mlir::OpResult resultView) const;
std::optional<std::int64_t> getViewOffset(mlir::Operation *op,
mlir::OpResult resultView) const;
};
} // namespace fir::acc
#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_

View File

@ -17,10 +17,41 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/InternalNames.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "llvm/ADT/SmallSet.h"
namespace fir::acc {
mlir::Value ReductionInitOpFortranObjectViewModel::getViewSource(
mlir::Operation *op, mlir::OpResult resultView) const {
assert(resultView.getOwner() == op && "result value must be the op's result");
assert(op->getNumResults() == 1 &&
"definition of acc.reduction_init changed");
auto iface = mlir::cast<mlir::RegionBranchOpInterface>(op);
llvm::SmallVector<mlir::Value, 1> resultValues;
iface.getPredecessorValues(mlir::RegionSuccessor::parent(), /*index=*/0,
resultValues);
assert(!resultValues.empty() &&
"acc.reduction_init's result must have at least one possible value");
mlir::Value passThroughValue;
for (mlir::Value v : resultValues) {
if (!passThroughValue) {
passThroughValue = v;
continue;
}
assert(passThroughValue == v &&
"acc.reduction_init must return the same allocation");
}
return passThroughValue;
}
std::optional<std::int64_t>
ReductionInitOpFortranObjectViewModel::getViewOffset(
mlir::Operation *op, mlir::OpResult resultView) const {
assert(resultView.getOwner() == op && "result value must be the op's result");
return 0;
}
template <>
mlir::Value PartialEntityAccessModel<fir::ArrayCoorOp>::getBaseEntity(
mlir::Operation *op) const {

View File

@ -98,6 +98,8 @@ void registerOpenACCExtensions(mlir::DialectRegistry &registry) {
mlir::acc::OpenACCDialect *dialect) {
mlir::acc::LoopOp::attachInterface<OperationMoveModel<mlir::acc::LoopOp>>(
*ctx);
mlir::acc::ReductionInitOp::attachInterface<
fir::acc::ReductionInitOpFortranObjectViewModel>(*ctx);
});
registerAttrsExtensions(registry);

View File

@ -0,0 +1,60 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
module {
acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
^bb0(%arg0: !fir.ref<i32>):
%0 = fir.alloca i32
acc.yield %0 : !fir.ref<i32>
}
acc.firstprivate.recipe @firstprivatization_ref_rec__QFtestTpoint : !fir.ref<!fir.type<_QFtestTpoint{x:f32}>> init {
^bb0(%arg0: !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>):
%0 = fir.alloca !fir.type<_QFtestTpoint{x:f32}>
%1 = fir.declare %0 {uniq_name = "acc.private.init"} : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>
acc.yield %1 : !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>
} copy {
^bb0(%arg0: !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>, %arg1: !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>):
%0 = fir.field_index x, !fir.type<_QFtestTpoint{x:f32}>
%1 = fir.coordinate_of %arg0, x : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<f32>
%2 = fir.field_index x, !fir.type<_QFtestTpoint{x:f32}>
%3 = fir.coordinate_of %arg1, x : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<f32>
%4 = fir.load %1 : !fir.ref<f32>
fir.store %4 to %3 : !fir.ref<f32>
acc.terminator
}
func.func @_QPtest(%arg0: !fir.ref<!fir.type<_QFtestTpoint{x:f32}>> {fir.bindc_name = "p"}) {
%c1_i32 = arith.constant 1 : i32
%0 = fir.dummy_scope : !fir.dscope
%1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"}
%2 = fir.declare %1 {uniq_name = "_QFtestEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
%3 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtestEn"}
%4 = fir.declare %3 {uniq_name = "_QFtestEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
%5 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtestEp"} : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>, !fir.dscope) -> !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>
%6 = acc.firstprivate varPtr(%5 : !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) recipe(@firstprivatization_ref_rec__QFtestTpoint) -> !fir.ref<!fir.type<_QFtestTpoint{x:f32}>> {name = "p"}
acc.parallel firstprivate(%6 : !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) {
%7 = fir.load %4 : !fir.ref<i32>
%8 = acc.private varPtr(%2 : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "i"}
acc.loop private(%8 : !fir.ref<i32>) control(%arg1 : i32) = (%c1_i32 : i32) to (%7 : i32) step (%c1_i32 : i32) {
%9 = fir.declare %8 {uniq_name = "_QFtestEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
fir.store %arg1 to %9 : !fir.ref<i32>
%10 = fir.load %9 : !fir.ref<i32>
%11 = fir.convert %10 : (i32) -> f32
%12 = fir.field_index x, !fir.type<_QFtestTpoint{x:f32}>
%13 = fir.coordinate_of %5, x : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<f32>
fir.store %11 to %13 : !fir.ref<f32>
acc.yield
} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
acc.yield
}
return
}
}
// CHECK: %[[VAL_7:.*]] = acc.firstprivate_map varPtr(%{{.*}} : !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>
// CHECK: acc.parallel {
// CHECK: %[[VAL_8:.*]] = fir.alloca !fir.type<_QFtestTpoint{x:f32}>
// CHECK: %[[VAL_9:.*]] = fir.declare %[[VAL_8]] {acc.var_name = #acc.var_name<"p">, uniq_name = "acc.private.init"} : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<!fir.type<_QFtestTpoint{x:f32}>>
// CHECK: %[[VAL_10:.*]] = fir.field_index x, !fir.type<_QFtestTpoint{x:f32}>
// CHECK: %[[VAL_11:.*]] = fir.coordinate_of %[[VAL_7]], x : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<f32>
// CHECK: %[[VAL_12:.*]] = fir.field_index x, !fir.type<_QFtestTpoint{x:f32}>
// CHECK: %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_9]], x : (!fir.ref<!fir.type<_QFtestTpoint{x:f32}>>) -> !fir.ref<f32>
// CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>

View File

@ -0,0 +1,56 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
// RUN: fir-opt %s -acc-recipe-materialization -acc-optimize-firstprivate-map | FileCheck %s --check-prefix=CHECK-MAP
module {
acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
^bb0(%arg0: !fir.ref<i32>):
%0 = fir.alloca i32
acc.yield %0 : !fir.ref<i32>
}
acc.firstprivate.recipe @firstprivatization_ref_i32 : !fir.ref<i32> init {
^bb0(%arg0: !fir.ref<i32>):
%0 = fir.alloca i32
acc.yield %0 : !fir.ref<i32>
} copy {
^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
%0 = fir.load %arg0 : !fir.ref<i32>
fir.store %0 to %arg1 : !fir.ref<i32>
acc.terminator
}
func.func @firstpriv() {
%c1336_i32 = arith.constant 1336 : i32
%0 = fir.dummy_scope : !fir.dscope
%1 = fir.alloca i32 {bindc_name = "t", uniq_name = "_QFfirstprivEt"}
%2 = fir.declare %1 {uniq_name = "_QFfirstprivEt"} : (!fir.ref<i32>) -> !fir.ref<i32>
fir.store %c1336_i32 to %2 : !fir.ref<i32>
%3 = acc.firstprivate varPtr(%2 : !fir.ref<i32>) recipe(@firstprivatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "t"}
acc.parallel firstprivate(%3 : !fir.ref<i32>) {
%c1_i32 = arith.constant 1 : i32
%4 = fir.load %3 : !fir.ref<i32>
%5 = arith.addi %4, %c1_i32 : i32
fir.store %5 to %3 : !fir.ref<i32>
acc.yield
}
return
}
}
// Verify that the firstprivate was materialized into a copy outside the kernel
// and an alloca (as per the recipe) inside the region.
// Then ensure that all uses are of the private alloca.
// CHECK-LABEL: func.func @firstpriv
// CHECK: acc.parallel
// CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {{.*}}acc.var_name = #acc.var_name<"t">
// CHECK: %[[FIRSTPRIVLOAD:.*]] = fir.load %{{.*}} : !fir.ref<i32>
// CHECK: fir.store %[[FIRSTPRIVLOAD]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK: %[[ALLOCALOAD:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
// CHECK: %[[ADDI:.*]] = arith.addi %[[ALLOCALOAD]], %c1{{.*}} : i32
// CHECK: fir.store %[[ADDI]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-MAP-LABEL: func.func @firstpriv
// CHECK-MAP: fir.load {{.*}} : !fir.ref<i32>
// CHECK-MAP: acc.parallel {
// CHECK-MAP-NOT: acc.firstprivate_map
// CHECK-MAP: fir.alloca i32 {{.*}}acc.var_name = #acc.var_name<"t">
// CHECK-MAP: fir.store {{.*}} to {{.*}} : !fir.ref<i32>
// CHECK-MAP: arith.addi {{.*}} %c1

View File

@ -0,0 +1,45 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
// acc.kernels with private: recipe materialized to alloca inside region
// CHECK-NOT: acc.private
// CHECK: fir.alloca i32 {{.*}}acc.var_name = #acc.var_name<"s">
acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
^bb0(%arg0: !fir.ref<i32>):
%0 = fir.alloca i32
acc.yield %0 : !fir.ref<i32>
}
func.func @kpriv_(%arg0: !fir.ref<i32> {fir.bindc_name = "start"}, %arg1: !fir.ref<!fir.array<32xi32>> {fir.bindc_name = "a"}) attributes {fir.internal_name = "_QPkpriv"} {
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = fir.dummy_scope : !fir.dscope
%1 = fir.shape %c32 : (index) -> !fir.shape<1>
%2 = fir.declare %arg1(%1) dummy_scope %0 arg 2 {uniq_name = "_QFkprivEa"} : (!fir.ref<!fir.array<32xi32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<32xi32>>
%3 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFkprivEi"}
%4 = fir.declare %3 {uniq_name = "_QFkprivEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
%5 = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFkprivEs"}
%6 = fir.declare %5 {uniq_name = "_QFkprivEs"} : (!fir.ref<i32>) -> !fir.ref<i32>
%7 = fir.declare %arg0 dummy_scope %0 arg 1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFkprivEstart"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
%8 = fir.load %7 : !fir.ref<i32>
fir.store %8 to %6 : !fir.ref<i32>
%9 = acc.copyin varPtr(%2 : !fir.ref<!fir.array<32xi32>>) -> !fir.ref<!fir.array<32xi32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "a"}
%10 = acc.private varPtr(%6 : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "s"}
acc.kernels dataOperands(%9 : !fir.ref<!fir.array<32xi32>>) private(%10 : !fir.ref<i32>) {
%11 = fir.shape %c32 : (index) -> !fir.shape<1>
acc.loop control(%arg2 : index) = (%c1 : index) to (%c32 : index) step (%c1 : index) {
%12 = fir.alloca i32 {bindc_name = "i"}
%13 = fir.declare %12 {uniq_name = "_QFkprivEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
%14 = fir.convert %arg2 : (index) -> i32
fir.store %14 to %13 : !fir.ref<i32>
%15 = fir.load %10 : !fir.ref<i32>
%16 = fir.load %13 : !fir.ref<i32>
%17 = fir.convert %16 : (i32) -> i64
%18 = fir.array_coor %9(%11) %17 : (!fir.ref<!fir.array<32xi32>>, !fir.shape<1>, i64) -> !fir.ref<i32>
fir.store %15 to %18 : !fir.ref<i32>
acc.yield
} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
acc.terminator
}
acc.copyout accPtr(%9 : !fir.ref<!fir.array<32xi32>>) to varPtr(%2 : !fir.ref<!fir.array<32xi32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "a"}
return
}

View File

@ -0,0 +1,50 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
// Test that the reduction recipes are correctly inlined when attached to a
// parallel construct without loop. Verify init and combine materialize in the region.
// CHECK-LABEL: func.func @par_reduction_clause_
// CHECK: acc.parallel {
// CHECK: [[PRIVATE:%.*]] = acc.reduction_init {{.*}} <add>
// CHECK-NEXT: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f64
// CHECK-NEXT: [[ALLOCA:%.*]] = fir.alloca f64
// CHECK-NEXT: {{.*}} = fir.declare [[ALLOCA]] {{.*}}acc.reduction.init
// CHECK-NEXT: fir.store [[ZERO]] to {{.*}}
// CHECK-NEXT: acc.yield {{.*}}
// CHECK: } {{.*}}acc.var_name = #acc.var_name<"tmp">
// CHECK: fir.load [[PRIVATE]]
// CHECK: fir.store {{.*}} to [[PRIVATE]]
// CHECK: acc.reduction_combine_region [[PRIVATE]] into [[REDUCVAR:%.*]] :
// CHECK: [[LOADVAR:%.*]] = fir.load [[REDUCVAR]]
// CHECK-NEXT: [[LOADPRIV:%.*]] = fir.load [[PRIVATE]]
// CHECK-NEXT: [[COMBINE:%.*]] = arith.addf [[LOADVAR]], [[LOADPRIV]]
// CHECK-NEXT: fir.store [[COMBINE]] to [[REDUCVAR]]
// CHECK: acc.yield
acc.reduction.recipe @reduction_add_ref_f64 : !fir.ref<f64> reduction_operator <add> init {
^bb0(%arg0: !fir.ref<f64>):
%cst = arith.constant 0.000000e+00 : f64
%0 = fir.alloca f64
%1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<f64>) -> !fir.ref<f64>
fir.store %cst to %1 : !fir.ref<f64>
acc.yield %1 : !fir.ref<f64>
} combiner {
^bb0(%arg0: !fir.ref<f64>, %arg1: !fir.ref<f64>):
%0 = fir.load %arg0 : !fir.ref<f64>
%1 = fir.load %arg1 : !fir.ref<f64>
%2 = arith.addf %0, %1 fastmath<contract> : f64
fir.store %2 to %arg0 : !fir.ref<f64>
acc.yield %arg0 : !fir.ref<f64>
}
func.func @par_reduction_clause_(%arg0: !fir.ref<f64> {fir.bindc_name = "tmp"}) attributes {fir.internal_name = "_QPpar_reduction_clause"} {
%cst = arith.constant 1.000000e+00 : f64
%0 = fir.dummy_scope : !fir.dscope
%1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFpar_reduction_clauseEtmp"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
%2 = acc.reduction varPtr(%1 : !fir.ref<f64>) recipe(@reduction_add_ref_f64) -> !fir.ref<f64> {name = "tmp"}
acc.parallel reduction(%2 : !fir.ref<f64>) {
%3 = fir.load %2 : !fir.ref<f64>
%4 = arith.addf %3, %cst fastmath<contract> : f64
fir.store %4 to %2 : !fir.ref<f64>
acc.yield
}
return
}

View File

@ -0,0 +1,47 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
acc.private.recipe @privatization_ref_i64 : !fir.ref<i64> init {
^bb0(%arg0: !fir.ref<i64>):
%0 = fir.alloca i64
acc.yield %0 : !fir.ref<i64>
}
// CHECK-LABEL: func.func @private_i64
// CHECK: acc.loop control([[IV:%.+]] : i64)
// CHECK: [[ALLOC:%.+]] = fir.alloca i64
// CHECK: [[DECL:%.+]] = fir.declare [[ALLOC]] {uniq_name = "_private_arg0"}
// CHECK: fir.store [[IV]] to [[DECL]]
func.func @private_i64(%arg0 : !fir.ref<i64>) {
%c16_i32 = arith.constant 16 : i32
%c1_i32 = arith.constant 1 : i32
%priv = acc.private varPtr(%arg0 : !fir.ref<i64>) recipe(@privatization_ref_i64) -> !fir.ref<i64> {implicit = true, name = ""}
acc.loop private(%priv : !fir.ref<i64>) control(%siv : i64) = (%c1_i32 : i32) to (%c16_i32 : i32) step (%c1_i32 : i32) {
%priv_decl = fir.declare %priv {uniq_name = "_private_arg0"} : (!fir.ref<i64>) -> !fir.ref<i64>
fir.store %siv to %priv_decl : !fir.ref<i64>
acc.yield
} attributes {independent = [#acc.device_type<none>]}
return
}
// CHECK-LABEL: func.func @par_private_i64
// CHECK: acc.parallel {
// CHECK: [[ALLOC:%.+]] = fir.alloca i64
// CHECK: [[DECL:%.+]] = fir.declare [[ALLOC]] {uniq_name = "_private_arg0"}
// CHECK: acc.loop control([[IV:%.+]] : i64)
// CHECK: fir.store [[IV]] to [[DECL]]
func.func @par_private_i64(%arg0 : !fir.ref<i64>) {
%c16_i32 = arith.constant 16 : i32
%c1_i32 = arith.constant 1 : i32
%priv = acc.private varPtr(%arg0 : !fir.ref<i64>) recipe(@privatization_ref_i64) -> !fir.ref<i64> {implicit = true, name = ""}
acc.parallel private(%priv : !fir.ref<i64>) {
%priv_decl = fir.declare %priv {uniq_name = "_private_arg0"} : (!fir.ref<i64>) -> !fir.ref<i64>
acc.loop control(%siv : i64) = (%c1_i32 : i32) to (%c16_i32 : i32) step (%c1_i32 : i32) {
fir.store %siv to %priv_decl : !fir.ref<i64>
acc.yield
} attributes {independent = [#acc.device_type<none>]}
acc.yield
}
return
}

View File

@ -0,0 +1,50 @@
// RUN: fir-opt %s -acc-recipe-materialization | FileCheck %s
// Verify that the reduction init and combine recipes attached to compute
// ops materialize within the region
// CHECK-LABEL: func.func @par_reduction_clause_
// CHECK: acc.parallel {
// CHECK: [[PRIVATE:%.*]] = acc.reduction_init {{.*}} <add>
// CHECK-NEXT: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f64
// CHECK-NEXT: [[ALLOCA:%.*]] = fir.alloca f64
// CHECK-NEXT: {{.*}} = fir.declare [[ALLOCA]] {{.*}}acc.reduction.init
// CHECK-NEXT: fir.store [[ZERO]] to {{.*}}
// CHECK-NEXT: acc.yield {{.*}}
// CHECK: } {{.*}}acc.var_name = #acc.var_name<"tmp">
// CHECK: fir.load [[PRIVATE]]
// CHECK: fir.store {{.*}} to [[PRIVATE]]
// CHECK: acc.reduction_combine_region [[PRIVATE]] into [[REDUCVAR:%.*]] :
// CHECK: [[LOADVAR:%.*]] = fir.load [[REDUCVAR]]
// CHECK-NEXT: [[LOADPRIV:%.*]] = fir.load [[PRIVATE]]
// CHECK-NEXT: [[COMBINE:%.*]] = arith.addf [[LOADVAR]], [[LOADPRIV]]
// CHECK-NEXT: fir.store [[COMBINE]] to [[REDUCVAR]]
// CHECK: acc.yield
acc.reduction.recipe @reduction_add_ref_f64 : !fir.ref<f64> reduction_operator <add> init {
^bb0(%arg0: !fir.ref<f64>):
%cst = arith.constant 0.000000e+00 : f64
%0 = fir.alloca f64
%1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<f64>) -> !fir.ref<f64>
fir.store %cst to %1 : !fir.ref<f64>
acc.yield %1 : !fir.ref<f64>
} combiner {
^bb0(%arg0: !fir.ref<f64>, %arg1: !fir.ref<f64>):
%0 = fir.load %arg0 : !fir.ref<f64>
%1 = fir.load %arg1 : !fir.ref<f64>
%2 = arith.addf %0, %1 fastmath<contract> : f64
fir.store %2 to %arg0 : !fir.ref<f64>
acc.yield %arg0 : !fir.ref<f64>
}
func.func @par_reduction_clause_(%arg0: !fir.ref<f64> {fir.bindc_name = "tmp"}) attributes {fir.internal_name = "_QPpar_reduction_clause"} {
%cst = arith.constant 1.000000e+00 : f64
%0 = fir.dummy_scope : !fir.dscope
%1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFpar_reduction_clauseEtmp"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
%2 = acc.reduction varPtr(%1 : !fir.ref<f64>) recipe(@reduction_add_ref_f64) -> !fir.ref<f64> {name = "tmp"}
acc.parallel reduction(%2 : !fir.ref<f64>) {
%3 = fir.load %2 : !fir.ref<f64>
%4 = arith.addf %3, %cst fastmath<contract> : f64
fir.store %4 to %2 : !fir.ref<f64>
acc.yield
}
return
}

View File

@ -16,10 +16,76 @@
#ifndef OPENACC_CG_OPS
#define OPENACC_CG_OPS
include "mlir/Interfaces/InferTypeOpInterface.td"
// This file is intended to be included from OpenACCOps.td, which provides
// the necessary includes and definitions. The operations defined here use
// types and definitions from that file.
//===----------------------------------------------------------------------===//
// acc.reduction_init
//===----------------------------------------------------------------------===//
def OpenACC_ReductionInitOp
: OpenACC_Op<"reduction_init",
[SameOperandsAndResultType, RecursiveMemoryEffects,
DeclareOpInterfaceMethods<RegionBranchOpInterface,
["getRegionInvocationBounds",
"getSuccessorInputs"]>,
SingleBlockImplicitTerminator<"YieldOp">]> {
let summary = "Allocate and initialize a reduction variable from a recipe";
let description = [{
This operation provides materialized allocation and initialization for a
private reduction variable from an OpenACC reduction recipe. The region
contains the recipe's init code and must yield a single value (the private
reduction storage) via `acc.yield`.
The var operand is the original/shared reduction variable. The
reduction_operator specifies the reduction kind (e.g. add, mul).
}];
let arguments = (ins OpenACC_AnyPointerOrMappableType:$var,
OpenACC_ReductionOperatorAttr:$reductionOperator);
let results = (outs OpenACC_AnyPointerOrMappableType:$result);
let regions = (region AnyRegion:$region);
let assemblyFormat = [{
$var $reductionOperator `:` type($result) $region attr-dict
}];
let hasVerifier = 1;
}
//===----------------------------------------------------------------------===//
// acc.reduction_combine_region
//===----------------------------------------------------------------------===//
def OpenACC_ReductionCombineRegionOp
: OpenACC_Op<"reduction_combine_region",
[SameTypeOperands, RecursiveMemoryEffects,
DeclareOpInterfaceMethods<RegionBranchOpInterface,
["getRegionInvocationBounds",
"getSuccessorInputs"]>,
SingleBlockImplicitTerminator<"YieldOp">]> {
let summary = "Combine a reduction private value with its original (recipe)";
let description = [{
This operation provides materialized reduction combine code from an
OpenACC reduction recipe. The region takes the partially reduced value(s)
from the private reduction variable and combines them with the current
value(s) in the original/shared reduction variable. The region is
terminated by `acc.yield` with no operands.
The destVar operand is the original/shared reduction variable.
The srcVar operand is typically the result of acc.reduction_init.
}];
let arguments = (ins OpenACC_AnyPointerOrMappableType:$destVar,
OpenACC_AnyPointerOrMappableType:$srcVar);
let results = (outs);
let regions = (region AnyRegion:$region);
let assemblyFormat = [{
$srcVar `into` $destVar `:` type($srcVar) $region
attr-dict
}];
let hasVerifier = 1;
}
//===----------------------------------------------------------------------===//
// acc.reduction_combine
//===----------------------------------------------------------------------===//

View File

@ -2844,7 +2844,8 @@ def OpenACC_LoopOp
// Yield operation for the acc.loop and acc.parallel operations.
def OpenACC_YieldOp : OpenACC_Op<"yield", [Pure, ReturnLike, Terminator,
ParentOneOf<["FirstprivateRecipeOp, LoopOp, ParallelOp, PrivateRecipeOp,"
"ReductionRecipeOp, SerialOp, AtomicUpdateOp"]>]> {
"ReductionRecipeOp, ReductionInitOp, ReductionCombineRegionOp,"
"SerialOp, AtomicUpdateOp"]>]> {
let summary = "Acc yield and termination operation";
let description = [{

View File

@ -13,6 +13,10 @@
#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
#define MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
#include "mlir/IR/Block.h"
#include "mlir/IR/ValueRange.h"
#include "llvm/ADT/SmallVector.h"
namespace mlir {
class IRMapping;
class Location;
@ -26,6 +30,16 @@ class ExecuteRegionOp;
namespace acc {
class LoopOp;
/// Clone an ACC region into a destination block at the given insertion point.
/// Requires a single-block source region. Maps block arguments and optional
/// result replacement: values in resultsToReplace are replaced with the
/// operands of the cloned region's acc.yield (1:1). Erases acc.yield/terminator
/// and merges blocks. Returns (replacement values, insertion point after
/// clone).
std::pair<llvm::SmallVector<Value>, Block::iterator>
cloneACCRegionInto(Region *src, Block *dest, Block::iterator inlinePoint,
IRMapping &mapping, ValueRange resultsToReplace);
/// Wrap a multi-block region in an scf.execute_region.
/// Clones the given region into a new scf.execute_region, replacing
/// acc.yield/acc.terminator with scf.yield. Use this to convert unstructured

View File

@ -372,6 +372,18 @@ def OffloadLiveInValueCanonicalization : Pass<"offload-livein-value-canonicaliza
let dependentDialects = ["mlir::acc::OpenACCDialect"];
}
def ACCRecipeMaterialization : Pass<"acc-recipe-materialization", "mlir::ModuleOp"> {
let summary = "Materialize OpenACC private, firstprivate and reduction recipes";
let description = [{
Materializes OpenACC privatization, firstprivate and reduction recipes by
cloning init, copy, combiner, and destroy into the construct. Replaces recipe
references with materialized values (including acc.reduction_init and
acc.reduction_combine_region for reductions) and removes unused recipe
symbols.
}];
let dependentDialects = ["mlir::acc::OpenACCDialect", "mlir::arith::ArithDialect"];
}
def OffloadTargetVerifier : Pass<"offload-target-verifier", "mlir::func::FuncOp"> {
let summary = "Verify values and symbols live into offload regions for legality";
let description = [{

View File

@ -247,6 +247,69 @@ void FirstprivateMapInitialOp::getEffects(
addResultEffect<MemoryEffects::Write>(effects, getAccVar());
}
//===----------------------------------------------------------------------===//
// ReductionInitOp
//===----------------------------------------------------------------------===//
void ReductionInitOp::getSuccessorRegions(
RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
regions);
}
void ReductionInitOp::getRegionInvocationBounds(
ArrayRef<Attribute> operands,
SmallVectorImpl<InvocationBounds> &invocationBounds) {
invocationBounds.emplace_back(1, 1);
}
ValueRange ReductionInitOp::getSuccessorInputs(RegionSuccessor successor) {
return getSingleRegionSuccessorInputs(getOperation(), successor);
}
LogicalResult ReductionInitOp::verify() {
Block &block = getRegion().front();
if (auto yieldOp = dyn_cast<acc::YieldOp>(block.getTerminator())) {
if (yieldOp.getNumOperands() != 1)
return emitOpError(
"region must yield exactly one value (private storage)");
if (yieldOp.getOperand(0).getType() != getVar().getType())
return emitOpError("yielded value type must match var type");
}
return success();
}
//===----------------------------------------------------------------------===//
// ReductionCombineRegionOp
//===----------------------------------------------------------------------===//
void ReductionCombineRegionOp::getSuccessorRegions(
RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
regions);
}
void ReductionCombineRegionOp::getRegionInvocationBounds(
ArrayRef<Attribute> operands,
SmallVectorImpl<InvocationBounds> &invocationBounds) {
invocationBounds.emplace_back(1, 1);
}
ValueRange
ReductionCombineRegionOp::getSuccessorInputs(RegionSuccessor successor) {
return getSingleRegionSuccessorInputs(getOperation(), successor);
}
LogicalResult ReductionCombineRegionOp::verify() {
Block &block = getRegion().front();
if (auto yieldOp = dyn_cast<acc::YieldOp>(block.getTerminator())) {
if (yieldOp.getNumOperands() != 0)
return emitOpError("region must be terminated by acc.yield with no "
"operands");
}
return success();
}
//===----------------------------------------------------------------------===//
// ReductionCombineOp
//===----------------------------------------------------------------------===//

View File

@ -0,0 +1,459 @@
//===- ACCRecipeMaterialization.cpp - Materialize ACC recipes -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Overview:
// ---------
// OpenACC compute constructs (acc.parallel, acc.serial, acc.kernels) and
// acc.loop can carry data clauses (acc.private, acc.firstprivate,
// acc.reduction) that refer to recipes (acc.private.recipe,
// acc.firstprivate.recipe, acc.reduction.recipe). Recipes define how to
// initialize, copy, combine, or destroy a particular variable. This pass clones
// those regions into the construct and ensures the materialized SSA values are
// used instead.
//
// Transforms:
// -----------
// 1. Firstprivate: Inserts acc.firstprivate_map so the initial value is
// available on the device, then clones the recipe init and copy regions
// into the construct and replaces uses with the materialized alloca.
// Optional destroy region is cloned before the region terminator.
//
// 2. Private: Clones the recipe init region into the construct (at the
// region entry or at the loop op for acc.loop private). Replaces uses
// of the recipe result with the materialized alloca. Optional destroy
// region is cloned before the region terminator.
//
// 3. Reduction: Creates acc.reduction_init (init region inlined) and
// acc.reduction_combine_region (combiner region inlined). Uses within
// the region are updated to the reduction init result.
//
// Requirements:
// -------------
// 1. OpenACCSupport: The pass uses the `acc::OpenACCSupport` analysis
// including emitNYI for unsupported cases.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Dialect/OpenACC/OpenACCUtils.h"
#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
namespace mlir {
namespace acc {
#define GEN_PASS_DEF_ACCRECIPEMATERIALIZATION
#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
} // namespace acc
} // namespace mlir
#define DEBUG_TYPE "acc-recipe-materialization"
namespace {
using namespace mlir;
static void saveVarName(StringRef name, Value dst) {
if (name.empty())
return;
if (Operation *dstOp = dst.getDefiningOp()) {
if (dstOp->getAttrOfType<acc::VarNameAttr>(acc::getVarNameAttrName()))
return;
if (isa<ACC_DATA_ENTRY_OPS>(dstOp))
return;
dstOp->setAttr(acc::getVarNameAttrName(),
acc::VarNameAttr::get(dstOp->getContext(), name));
return;
}
auto blockArg = dyn_cast<BlockArgument>(dst);
if (!blockArg)
return;
Block *block = blockArg.getOwner();
Region *region = block ? block->getParent() : nullptr;
if (!region || !block->isEntryBlock())
return;
Operation *parent = region->getParentOp();
if (!parent)
return;
auto funcOp = dyn_cast<FunctionOpInterface>(parent);
if (!funcOp)
return;
unsigned argIdx = blockArg.getArgNumber();
if (argIdx >= funcOp.getNumArguments())
return;
if (funcOp.getArgAttr(argIdx, acc::getVarNameAttrName()))
return;
funcOp.setArgAttr(argIdx, acc::getVarNameAttrName(),
acc::VarNameAttr::get(parent->getContext(), name));
}
static void saveVarName(Value src, Value dst) {
saveVarName(acc::getVariableName(src), dst);
}
// Clone the destroy region of the recipe before the terminator of the provided
// block. Values must be provided for the destroy region block arguments
// according to the recipe specifications.
template <typename RecipeOpTy>
static void cloneDestroy(RecipeOpTy recipe, mlir::Block *block,
const llvm::SmallVector<mlir::Value> &arguments) {
IRMapping mapping{};
Region &destroyRegion = recipe.getDestroyRegion();
assert(destroyRegion.getBlocks().front().getNumArguments() ==
arguments.size() &&
"unexpected acc recipe destroy block arguments");
mapping.map(destroyRegion.getBlocks().front().getArguments(), arguments);
acc::cloneACCRegionInto(&destroyRegion, block, std::prev(block->end()),
mapping,
/*resultsToReplace=*/{});
}
class ACCRecipeMaterialization
: public acc::impl::ACCRecipeMaterializationBase<ACCRecipeMaterialization> {
public:
using acc::impl::ACCRecipeMaterializationBase<
ACCRecipeMaterialization>::ACCRecipeMaterializationBase;
void runOnOperation() override;
private:
// When handling firstprivate, the initial value needs to be available on
// the GPU. One way to get that value there is to map the variable through
// global memory.
// Thus, when we materialize a firstprivate, we materialize it into
// a mapping action first. This function ends up with doing the following:
// %dev = acc.firstprivate var(%var)
// =>
// %copy = acc.firstprivate_map var(%var)
// %dev = acc.firstprivate var(%copy)
// When the recipe materialization happens, the `acc.firstprivate` ends up
// being removed. But because of the way we chain it to the
// `acc.firstprivate_map`, then its result becomes live-in to the
// compute region and used as the variable the initial value is loaded from.
void handleFirstprivateMapping(acc::FirstprivateOp firstprivateOp) const;
template <typename OpTy>
void removeRecipe(OpTy op, ModuleOp moduleOp) const;
template <typename OpTy, typename RecipeOpTy, typename AccOpTy>
LogicalResult materialize(OpTy op, RecipeOpTy recipe, AccOpTy accOp,
acc::OpenACCSupport &accSupport) const;
template <typename OpTy>
LogicalResult materializeForACCOp(OpTy accOp,
acc::OpenACCSupport &accSupport) const;
};
void ACCRecipeMaterialization::handleFirstprivateMapping(
acc::FirstprivateOp firstprivateOp) const {
OpBuilder builder(firstprivateOp);
auto mapFirstprivateOp = acc::FirstprivateMapInitialOp::create(
builder, firstprivateOp.getLoc(), firstprivateOp.getVar(),
firstprivateOp.getStructured(), firstprivateOp.getImplicit(),
firstprivateOp.getBounds());
mapFirstprivateOp.setName(firstprivateOp.getName());
firstprivateOp.getVarMutable().assign(mapFirstprivateOp.getAccVar());
}
template <typename OpTy>
void ACCRecipeMaterialization::removeRecipe(OpTy op, ModuleOp moduleOp) const {
auto recipeName = op.getNameAttr();
if (SymbolTable::symbolKnownUseEmpty(recipeName, moduleOp)) {
LLVM_DEBUG(llvm::dbgs() << "erasing recipe: " << recipeName << "\n");
op.erase();
} else {
LLVM_DEBUG({
std::optional<SymbolTable::UseRange> symbolUses =
op.getSymbolUses(moduleOp);
if (symbolUses.has_value()) {
for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
llvm::dbgs() << "symbol use: ";
symbolUse.getUser()->dump();
}
}
});
llvm_unreachable("expected no use of recipe symbol");
}
}
template <typename OpTy, typename RecipeOpTy, typename AccOpTy>
LogicalResult
ACCRecipeMaterialization::materialize(OpTy op, RecipeOpTy recipe, AccOpTy accOp,
acc::OpenACCSupport &accSupport) const {
Region &region = accOp.getRegion();
Value origPtr = op.getVar();
Value accPtr = op.getAccVar();
assert(accPtr && "invalid op: null acc var");
OpBuilder b(op);
SmallVector<Value> triples;
// Clone init block into the region at the insertion point specified.
Region &initRegion = recipe.getInitRegion();
unsigned initNumArguments =
initRegion.getBlocks().front().getArguments().size();
if (initNumArguments > 1) {
// Code from C/C++ will most likely only provide extent arguments to the
// recipe arguments.
if ((initNumArguments - 1) % 3 != 0) {
(void)accSupport.emitNYI(recipe.getLoc(),
"privatization of array section with extents");
return failure();
}
// The remaining arguments must be the bounds triples
// (lower-bound, upper-bound, step), ...
unsigned argIdx = 1;
// Cast the given value to the type of the combiner region's argument
// at position argIdx, and increment argIdx.
auto castValueToArgType = [&](Location loc, Value v) {
return convertScalarToDtype(
b, loc, v,
initRegion.getBlocks().front().getArgument(argIdx++).getType(),
/*isUnsignedCast=*/false);
};
for (Value bound : acc::getBounds(op)) {
auto dataBound = bound.getDefiningOp<acc::DataBoundsOp>();
assert(dataBound &&
"acc.reduction's bound must be defined by acc.bounds");
// NOTE: we should probably generate get_lowerbound, get_upperbound
// and get_stride here, so that we can stop looking for the acc.bounds
// operation above, and just use the `bound` value.
Value lb =
castValueToArgType(dataBound.getLoc(), dataBound.getLowerbound());
Value ub =
castValueToArgType(dataBound.getLoc(), dataBound.getUpperbound());
Value step =
castValueToArgType(dataBound.getLoc(), dataBound.getStride());
triples.append({lb, ub, step});
}
assert(triples.size() + 1 == initNumArguments &&
"mismatch between number bounds and number of recipe init block "
"arguments");
}
IRMapping mapping;
SmallVector<Value> initArgs{origPtr};
initArgs.append(triples);
mapping.map(initRegion.getBlocks().front().getArguments(), initArgs);
if constexpr (std::is_same_v<OpTy, acc::PrivateOp>) {
// Clone the init region for a private.
Block *block = &region.front();
auto [results, ip] = acc::cloneACCRegionInto(
&initRegion, block, block->begin(), mapping, {accPtr});
assert(results.size() == 1 && "expected single result from init region");
saveVarName(op.getAccVar(), results[0]);
// Clone the destroy region for a private, if it exists.
if (!recipe.getDestroyRegion().empty()) {
results.insert(results.begin(), origPtr);
results.append(triples);
cloneDestroy(recipe, block, results);
}
} else if constexpr (std::is_same_v<OpTy, acc::FirstprivateOp>) {
// Clone the init region for a firstprivate.
Block *block = &region.front();
auto [results, ip] = acc::cloneACCRegionInto(
&initRegion, block, block->begin(), mapping, {accPtr});
assert(results.size() == 1 && "expected single result from init region");
saveVarName(op.getAccVar(), results[0]);
// We want the copy to store the origPtr to private
results.insert(results.begin(), origPtr);
results.append(triples);
// Clone the copy region for a firstprivate
mapping.clear();
mapping.map(recipe.getCopyRegion().front().getArguments(), results);
// Clone the copy region for a firstprivate.
acc::cloneACCRegionInto(&recipe.getCopyRegion(), block, std::next(ip),
mapping, {});
if (!recipe.getDestroyRegion().empty()) {
// origPtr was already pushed.
cloneDestroy(recipe, block, results);
}
} else if constexpr (std::is_same_v<OpTy, acc::ReductionOp>) {
auto cloneRegionIntoAccRegion = [&](Region *src, Region *dest,
bool hasResult) {
src->cloneInto(dest, mapping);
Block *block = &dest->front();
Operation *terminator = block->getTerminator();
b.setInsertionPoint(terminator);
if (hasResult)
acc::YieldOp::create(b, op.getLoc(), terminator->getOperands());
else
acc::YieldOp::create(b, op.getLoc(), ValueRange{});
terminator->erase();
};
// Clone the init region into acc.reduction_init.
if constexpr (std::is_same_v<AccOpTy, acc::ParallelOp>)
b.setInsertionPointToStart(&region.front());
else if constexpr (std::is_same_v<AccOpTy, acc::LoopOp>)
b.setInsertionPoint(op);
else
llvm_unreachable("unexpected acc op with reduction recipe");
auto reductionOp = acc::ReductionInitOp::create(
b, op.getLoc(), origPtr, recipe.getReductionOperatorAttr());
saveVarName(op.getAccVar(), reductionOp.getResult());
cloneRegionIntoAccRegion(&initRegion, &reductionOp.getRegion(),
/*hasResult=*/true);
// Update the uses within the loop to use the reduction op result.
replaceAllUsesInRegionWith(accPtr, reductionOp.getResult(), region);
// Clone the combiner region into acc.reduction_combine_region.
Region &combinerRegion = recipe.getCombinerRegion();
Block *entryBlock = &combinerRegion.front();
if constexpr (std::is_same_v<AccOpTy, acc::ParallelOp>)
b.setInsertionPoint(region.back().getTerminator());
else if constexpr (std::is_same_v<AccOpTy, acc::LoopOp>)
b.setInsertionPointAfter(accOp);
else
llvm_unreachable("unexpected acc op with reduction recipe");
// Map the first two block arguments to the original and private
// reduction variables. If the recipe's combiner region has the bounds
// arguments, we have to map them to the corresponding operands of
// acc.reduction operation.
mapping.clear();
SmallVector<Value, 2> argsRemapping{origPtr, reductionOp.getResult()};
argsRemapping.append(triples);
mapping.map(entryBlock->getArguments(), argsRemapping);
auto combineRegionOp = acc::ReductionCombineRegionOp::create(
b, op.getLoc(), origPtr, reductionOp.getResult());
cloneRegionIntoAccRegion(&combinerRegion, &combineRegionOp.getRegion(),
/*hasResult=*/false);
auto setSeqParDimsForRecipeLoops = [](Region *r) {
r->walk([](LoopLikeOpInterface loopLike) {
loopLike->setAttr(
acc::GPUParallelDimsAttr::name,
acc::GPUParallelDimsAttr::seq(loopLike->getContext()));
});
};
setSeqParDimsForRecipeLoops(&reductionOp.getRegion());
setSeqParDimsForRecipeLoops(&combineRegionOp.getRegion());
if (!recipe.getDestroyRegion().empty()) {
(void)accSupport.emitNYI(
recipe.getLoc(),
"OpenACC reduction variable that requires destruction code");
return failure();
}
} else {
llvm_unreachable("unexpected op type");
}
op.erase();
return success();
}
template <typename OpTy>
LogicalResult ACCRecipeMaterialization::materializeForACCOp(
OpTy accOp, acc::OpenACCSupport &accSupport) const {
assert(isa<ACC_COMPUTE_CONSTRUCT_AND_LOOP_OPS>(accOp));
if (!accOp.getFirstprivateOperands().empty()) {
// Clear the firstprivate operands list so there will be no uses after
// the recipe is materialized.
SmallVector<Value> operands(accOp.getFirstprivateOperands());
accOp.getFirstprivateOperandsMutable().clear();
for (Value operand : operands) {
auto firstprivateOp = cast<acc::FirstprivateOp>(operand.getDefiningOp());
auto symbolRef = cast<SymbolRefAttr>(firstprivateOp.getRecipeAttr());
auto decl = SymbolTable::lookupNearestSymbolFrom(accOp, symbolRef);
auto recipeOp = cast<acc::FirstprivateRecipeOp>(decl);
LLVM_DEBUG(llvm::dbgs() << "materializing: " << firstprivateOp << "\n"
<< symbolRef << "\n");
handleFirstprivateMapping(firstprivateOp);
if (failed(materialize(firstprivateOp, recipeOp, accOp, accSupport)))
return failure();
}
}
if (!accOp.getPrivateOperands().empty()) {
// Clear the private operands list so there will be no uses after
// the recipe is materialized.
SmallVector<Value> operands(accOp.getPrivateOperands());
accOp.getPrivateOperandsMutable().clear();
for (Value operand : operands) {
auto privateOp = cast<acc::PrivateOp>(operand.getDefiningOp());
auto symbolRef = cast<SymbolRefAttr>(privateOp.getRecipeAttr());
auto decl = SymbolTable::lookupNearestSymbolFrom(accOp, symbolRef);
auto recipeOp = cast<acc::PrivateRecipeOp>(decl);
LLVM_DEBUG(llvm::dbgs() << "materializing: " << privateOp << "\n"
<< symbolRef << "\n");
if (failed(materialize(privateOp, recipeOp, accOp, accSupport)))
return failure();
}
}
if (!accOp.getReductionOperands().empty()) {
// Clear the reduction operands list so there will be no uses after
// the recipe is materialized.
SmallVector<Value> operands(accOp.getReductionOperands());
accOp.getReductionOperandsMutable().clear();
for (Value operand : operands) {
auto reductionOp = cast<acc::ReductionOp>(operand.getDefiningOp());
auto symbolRef = cast<SymbolRefAttr>(reductionOp.getRecipeAttr());
auto decl = SymbolTable::lookupNearestSymbolFrom(accOp, symbolRef);
auto recipeOp = cast<acc::ReductionRecipeOp>(decl);
LLVM_DEBUG(llvm::dbgs() << "materializing: " << reductionOp << "\n"
<< symbolRef << "\n");
if (failed(materialize(reductionOp, recipeOp, accOp, accSupport)))
return failure();
}
}
return success();
}
void ACCRecipeMaterialization::runOnOperation() {
ModuleOp moduleOp = getOperation();
acc::OpenACCSupport &accSupport = getAnalysis<acc::OpenACCSupport>();
// Materialize all recipes for all compute constructs and loop constructs.
bool anyFailed = false;
moduleOp.walk([&](Operation *op) {
if (anyFailed)
return;
TypeSwitch<Operation *>(op).Case<ACC_COMPUTE_CONSTRUCT_AND_LOOP_OPS>(
[&](auto constructOp) {
if (failed(materializeForACCOp(constructOp, accSupport)))
anyFailed = true;
});
});
if (anyFailed) {
signalPassFailure();
return;
}
// Remove all recipes.
moduleOp.walk([&](Operation *op) {
if (auto recipe = dyn_cast<acc::ReductionRecipeOp>(op))
removeRecipe(recipe, moduleOp);
else if (auto recipe = dyn_cast<acc::PrivateRecipeOp>(op))
removeRecipe(recipe, moduleOp);
else if (auto recipe = dyn_cast<acc::FirstprivateRecipeOp>(op))
removeRecipe(recipe, moduleOp);
});
}
} // namespace

View File

@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIROpenACCTransforms
ACCDeclareGPUModuleInsertion.cpp
ACCIfClauseLowering.cpp
ACCImplicitData.cpp
ACCRecipeMaterialization.cpp
ACCLoopTiling.cpp
ACCImplicitDeclare.cpp
ACCImplicitRoutine.cpp

View File

@ -18,6 +18,8 @@
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/ErrorHandling.h"
using namespace mlir;
@ -105,42 +107,16 @@ static void normalizeIVUses(OpBuilder &b, Location loc, Value iv, Value origLB,
iv.replaceAllUsesExcept(denormalized, exceptions);
}
/// Clone an ACC region into a destination block, handling the ACC terminators.
/// Returns the insertion point after the cloned operations.
static Block::iterator cloneACCRegionInto(Region *src, Block *dest,
Block::iterator insertionPoint,
IRMapping &mapping,
RewriterBase &rewriter) {
assert(src->hasOneBlock() && "expected single-block region");
Region *insertRegion = dest->getParent();
Block *postInsertBlock = rewriter.splitBlock(dest, insertionPoint);
rewriter.cloneRegionBefore(*src, *insertRegion,
postInsertBlock->getIterator(), mapping);
auto lastNewBlock = std::prev(postInsertBlock->getIterator());
Block::iterator newInsertionPoint;
Operation *terminator = lastNewBlock->getTerminator();
if (auto yieldOp = dyn_cast<acc::YieldOp>(terminator)) {
newInsertionPoint = std::prev(yieldOp->getIterator());
rewriter.eraseOp(yieldOp);
} else if (auto terminatorOp = dyn_cast<acc::TerminatorOp>(terminator)) {
newInsertionPoint = std::prev(terminatorOp->getIterator());
rewriter.eraseOp(terminatorOp);
} else {
llvm_unreachable("unexpected terminator in ACC region");
}
// Merge last block with the postInsertBlock
rewriter.mergeBlocks(postInsertBlock, &*lastNewBlock);
// Merge first block with original dest block
Block *firstNewBlock = &*std::next(dest->getIterator());
rewriter.mergeBlocks(firstNewBlock, dest);
return newInsertionPoint;
/// Helper used by loop conversion: clone region and return insertion point
/// only.
static Block::iterator cloneACCRegionIntoForLoop(Region *src, Block *dest,
Block::iterator insertionPoint,
IRMapping &mapping,
RewriterBase &rewriter) {
auto [replacements, ip] =
acc::cloneACCRegionInto(src, dest, insertionPoint, mapping, ValueRange{});
(void)replacements;
return ip;
}
} // namespace
@ -148,6 +124,49 @@ static Block::iterator cloneACCRegionInto(Region *src, Block *dest,
namespace mlir {
namespace acc {
std::pair<SmallVector<Value>, Block::iterator>
cloneACCRegionInto(Region *src, Block *dest, Block::iterator inlinePoint,
IRMapping &mapping, ValueRange resultsToReplace) {
if (!src->hasOneBlock())
llvm_unreachable("cloneACCRegionInto: multi-block region not supported "
"(requires scf.execute_region)");
Region *insertRegion = dest->getParent();
Block *postInsertBlock = dest->splitBlock(inlinePoint);
src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapping);
SmallVector<Value> replacements;
Block *lastNewBlock = &*std::prev(postInsertBlock->getIterator());
Block::iterator ip;
if (auto yieldOp = dyn_cast<acc::YieldOp>(lastNewBlock->getTerminator())) {
for (auto [replacement, orig] :
llvm::zip(yieldOp.getOperands(), resultsToReplace)) {
replaceAllUsesInRegionWith(orig, replacement, *dest->getParent());
replacements.push_back(replacement);
}
ip = std::prev(yieldOp->getIterator());
yieldOp.erase();
} else {
auto terminatorOp =
dyn_cast<acc::TerminatorOp>(lastNewBlock->getTerminator());
if (!terminatorOp)
llvm_unreachable(
"cloneACCRegionInto: expected acc.yield or acc.terminator");
ip = std::prev(terminatorOp->getIterator());
terminatorOp.erase();
}
lastNewBlock->getOperations().splice(lastNewBlock->end(),
postInsertBlock->getOperations());
postInsertBlock->erase();
Block *firstNewBlock = &*std::next(dest->getIterator());
dest->getOperations().splice(dest->end(), firstNewBlock->getOperations());
firstNewBlock->erase();
return {replacements, ip};
}
/// Wrap a multi-block region with scf.execute_region.
scf::ExecuteRegionOp
wrapMultiBlockRegionWithSCFExecuteRegion(Region &region, IRMapping &mapping,
@ -232,8 +251,8 @@ scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, RewriterBase &rewriter,
mapACCLoopIVsToSCFIVs(loopOp, scfIVs, rewriter, mapping);
// Clone the loop body into the innermost scf.for
cloneACCRegionInto(&loopOp.getRegion(), forOps.back().getBody(),
rewriter.getInsertionPoint(), mapping, rewriter);
cloneACCRegionIntoForLoop(&loopOp.getRegion(), forOps.back().getBody(),
rewriter.getInsertionPoint(), mapping, rewriter);
// Optionally collapse nested loops
if (enableCollapse && forOps.size() > 1)
@ -292,8 +311,8 @@ scf::ParallelOp convertACCLoopToSCFParallel(LoopOp loopOp,
return nullptr;
}
} else {
cloneACCRegionInto(&loopOp.getRegion(), parallelOp.getBody(),
rewriter.getInsertionPoint(), mapping, rewriter);
cloneACCRegionIntoForLoop(&loopOp.getRegion(), parallelOp.getBody(),
rewriter.getInsertionPoint(), mapping, rewriter);
}
// Denormalize IV uses

View File

@ -0,0 +1,44 @@
// RUN: mlir-opt %s -acc-recipe-materialization | FileCheck %s
acc.firstprivate.recipe @firstprivatization_memref_i32 : memref<i32> init {
^bb0(%arg0: memref<i32>):
%0 = memref.alloca() : memref<i32>
acc.yield %0 : memref<i32>
} copy {
^bb0(%arg0: memref<i32>, %arg1: memref<i32>):
%0 = memref.load %arg0[] : memref<i32>
memref.store %0, %arg1[] : memref<i32>
acc.terminator
}
acc.private.recipe @privatization_memref_i32 : memref<i32> init {
^bb0(%arg0: memref<i32>):
%0 = memref.alloca() : memref<i32>
acc.yield %0 : memref<i32>
}
// Verify that the firstprivate was materialized into a copy outside the kernel
// and an alloca (as per the recipe) inside the region.
// Then ensure that all uses are of the private alloca.
// CHECK-LABEL: func.func @firstpriv
// CHECK: acc.parallel {
// CHECK: %[[ALLOCA:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"t">} : memref<i32>
// CHECK: %[[FIRSTPRIVLOAD:.*]] = memref.load %{{.*}}[] : memref<i32>
// CHECK: memref.store %[[FIRSTPRIVLOAD]], %[[ALLOCA]][] : memref<i32>
// CHECK: %[[ALLOCALOAD:.*]] = memref.load %[[ALLOCA]][] : memref<i32>
// CHECK: %[[ADDI:.*]] = arith.addi %[[ALLOCALOAD]], %c1{{.*}} : i32
// CHECK: memref.store %[[ADDI]], %[[ALLOCA]][] : memref<i32>
func.func @firstpriv() {
%c1336 = arith.constant 1336 : i32
%alloc = memref.alloca() : memref<i32>
memref.store %c1336, %alloc[] : memref<i32>
%fp = acc.firstprivate varPtr(%alloc : memref<i32>) recipe(@firstprivatization_memref_i32) -> memref<i32> {implicit = true, name = "t"}
acc.parallel firstprivate(%fp : memref<i32>) {
%c1 = arith.constant 1 : i32
%v = memref.load %fp[] : memref<i32>
%add = arith.addi %v, %c1 : i32
memref.store %add, %fp[] : memref<i32>
acc.yield
}
return
}

View File

@ -0,0 +1,34 @@
// RUN: mlir-opt %s -acc-recipe-materialization | FileCheck %s
// acc.kernels with private: recipe materialized to alloca inside region
// CHECK-NOT: acc.private
// CHECK: acc.kernels dataOperands(
// CHECK: memref.alloca() {acc.var_name = #acc.var_name<"s">} : memref<i32>
acc.private.recipe @privatization_memref_i32 : memref<i32> init {
^bb0(%arg0: memref<i32>):
%0 = memref.alloca() : memref<i32>
acc.yield %0 : memref<i32>
}
func.func @kpriv_(%arg0: memref<i32>, %arg1: memref<32xi32>) {
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%iv_alloc = memref.alloca() : memref<i32>
%start = memref.load %arg0[] : memref<i32>
memref.store %start, %iv_alloc[] : memref<i32>
%copy = acc.copyin varPtr(%arg1 : memref<32xi32>) -> memref<32xi32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "a"}
%priv = acc.private varPtr(%iv_alloc : memref<i32>) recipe(@privatization_memref_i32) -> memref<i32> {implicit = true, name = "s"}
acc.kernels dataOperands(%copy : memref<32xi32>) private(%priv : memref<i32>) {
acc.loop control(%arg2 : index) = (%c1 : index) to (%c32 : index) step (%c1 : index) {
%iv = arith.index_cast %arg2 : index to i32
memref.store %iv, %iv_alloc[] : memref<i32>
%s_val = memref.load %priv[] : memref<i32>
memref.store %s_val, %copy[%arg2] : memref<32xi32>
acc.yield
} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
acc.terminator
}
acc.copyout accPtr(%copy : memref<32xi32>) to varPtr(%arg1 : memref<32xi32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "a"}
return
}

View File

@ -0,0 +1,46 @@
// RUN: mlir-opt %s -acc-recipe-materialization | FileCheck %s
// Test that the reduction recipes are correctly inlined when attached to a
// parallel construct without loop. Verify init and combine materialize in the region.
// CHECK-LABEL: func.func @par_reduction_clause_
// CHECK: acc.parallel {
// CHECK: [[PRIVATE:%.*]] = acc.reduction_init {{.*}} <add>
// CHECK-NEXT: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f64
// CHECK-NEXT: [[ALLOCA:%.*]] = memref.alloca() : memref<f64>
// CHECK-NEXT: memref.store [[ZERO]], [[ALLOCA]][]
// CHECK-NEXT: acc.yield {{.*}}
// CHECK: } {{.*}}acc.var_name = #acc.var_name<"tmp">
// CHECK: memref.load [[PRIVATE]][]
// CHECK: memref.store {{.*}}, [[PRIVATE]][]
// CHECK: acc.reduction_combine_region [[PRIVATE]] into [[REDUCVAR:%.*]] :
// CHECK: [[LOADVAR:%.*]] = memref.load [[REDUCVAR]][]
// CHECK-NEXT: [[LOADPRIV:%.*]] = memref.load [[PRIVATE]][]
// CHECK-NEXT: [[COMBINE:%.*]] = arith.addf [[LOADVAR]], [[LOADPRIV]]
// CHECK-NEXT: memref.store [[COMBINE]], [[REDUCVAR]][]
// CHECK: acc.yield
acc.reduction.recipe @reduction_add_memref_f64 : memref<f64> reduction_operator <add> init {
^bb0(%arg0: memref<f64>):
%cst = arith.constant 0.000000e+00 : f64
%0 = memref.alloca() : memref<f64>
memref.store %cst, %0[] : memref<f64>
acc.yield %0 : memref<f64>
} combiner {
^bb0(%arg0: memref<f64>, %arg1: memref<f64>):
%0 = memref.load %arg0[] : memref<f64>
%1 = memref.load %arg1[] : memref<f64>
%2 = arith.addf %0, %1 fastmath<contract> : f64
memref.store %2, %arg0[] : memref<f64>
acc.yield %arg0 : memref<f64>
}
func.func @par_reduction_clause_(%arg0: memref<f64>) {
%cst = arith.constant 1.000000e+00 : f64
%0 = acc.reduction varPtr(%arg0 : memref<f64>) recipe(@reduction_add_memref_f64) -> memref<f64> {name = "tmp"}
acc.parallel reduction(%0 : memref<f64>) {
%1 = memref.load %0[] : memref<f64>
%2 = arith.addf %1, %cst fastmath<contract> : f64
memref.store %2, %0[] : memref<f64>
acc.yield
}
return
}

View File

@ -0,0 +1,45 @@
// RUN: mlir-opt %s -acc-recipe-materialization | FileCheck %s
acc.private.recipe @privatization_memref_i64 : memref<i64> init {
^bb0(%arg0: memref<i64>):
%0 = memref.alloca() : memref<i64>
acc.yield %0 : memref<i64>
}
// CHECK-LABEL: func.func @private_i64
// CHECK: acc.loop control([[IV:%.+]] : index)
// CHECK: [[ALLOC:%.+]] = memref.alloca() : memref<i64>
// CHECK: memref.store {{.*}}, [[ALLOC]][]
func.func @private_i64(%arg0 : memref<i64>) {
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%priv = acc.private varPtr(%arg0 : memref<i64>) recipe(@privatization_memref_i64) -> memref<i64> {implicit = true, name = ""}
acc.loop private(%priv : memref<i64>) control(%siv : index) = (%c1 : index) to (%c16 : index) step (%c1 : index) {
%iv_i64 = arith.index_cast %siv : index to i64
memref.store %iv_i64, %priv[] : memref<i64>
acc.yield
} attributes {independent = [#acc.device_type<none>]}
return
}
// CHECK-LABEL: func.func @par_private_i64
// CHECK: acc.parallel {
// CHECK: [[ALLOC:%.+]] = memref.alloca() : memref<i64>
// CHECK: acc.loop control([[IV:%.+]] : index)
// CHECK: memref.store {{.*}}, [[ALLOC]][]
func.func @par_private_i64(%arg0 : memref<i64>) {
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%priv = acc.private varPtr(%arg0 : memref<i64>) recipe(@privatization_memref_i64) -> memref<i64> {implicit = true, name = ""}
acc.parallel private(%priv : memref<i64>) {
acc.loop control(%siv : index) = (%c1 : index) to (%c16 : index) step (%c1 : index) {
%iv_i64 = arith.index_cast %siv : index to i64
memref.store %iv_i64, %priv[] : memref<i64>
acc.yield
} attributes {independent = [#acc.device_type<none>]}
acc.yield
}
return
}

View File

@ -0,0 +1,47 @@
// RUN: mlir-opt %s -acc-recipe-materialization | FileCheck %s
acc.reduction.recipe @reduction_add_memref_f64 : memref<f64> reduction_operator <add> init {
^bb0(%arg0: memref<f64>):
%cst = arith.constant 0.000000e+00 : f64
%0 = memref.alloca() : memref<f64>
memref.store %cst, %0[] : memref<f64>
acc.yield %0 : memref<f64>
} combiner {
^bb0(%arg0: memref<f64>, %arg1: memref<f64>):
%0 = memref.load %arg0[] : memref<f64>
%1 = memref.load %arg1[] : memref<f64>
%2 = arith.addf %0, %1 fastmath<contract> : f64
memref.store %2, %arg0[] : memref<f64>
acc.yield %arg0 : memref<f64>
}
// Verify that the reduction init and combine recipes attached to compute
// ops materialize within the region
// CHECK-LABEL: func.func @par_reduction_clause_
// CHECK: acc.parallel {
// CHECK: [[PRIVATE:%.*]] = acc.reduction_init {{.*}} <add>
// CHECK-NEXT: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f64
// CHECK-NEXT: [[ALLOCA:%.*]] = memref.alloca() : memref<f64>
// CHECK-NEXT: memref.store [[ZERO]], [[ALLOCA]][]
// CHECK-NEXT: acc.yield {{.*}}
// CHECK: } {{.*}}acc.var_name = #acc.var_name<"tmp">
// CHECK: memref.load [[PRIVATE]][]
// CHECK: memref.store {{.*}}, [[PRIVATE]][]
// CHECK: acc.reduction_combine_region [[PRIVATE]] into [[REDUCVAR:%.*]] :
// CHECK: [[LOADVAR:%.*]] = memref.load [[REDUCVAR]][]
// CHECK-NEXT: [[LOADPRIV:%.*]] = memref.load [[PRIVATE]][]
// CHECK-NEXT: [[COMBINE:%.*]] = arith.addf [[LOADVAR]], [[LOADPRIV]]
// CHECK-NEXT: memref.store [[COMBINE]], [[REDUCVAR]][]
// CHECK: acc.yield
func.func @par_reduction_clause_(%arg0: memref<f64>) {
%cst = arith.constant 1.000000e+00 : f64
%red = acc.reduction varPtr(%arg0 : memref<f64>) recipe(@reduction_add_memref_f64) -> memref<f64> {name = "tmp"}
acc.parallel reduction(%red : memref<f64>) {
%3 = memref.load %red[] : memref<f64>
%4 = arith.addf %3, %cst fastmath<contract> : f64
memref.store %4, %red[] : memref<f64>
acc.yield
}
return
}

View File

@ -701,3 +701,89 @@ TEST_F(OpenACCUtilsLoopTest, UnstructuredLoopWithYieldOperandsReturnsNullptr) {
EXPECT_FALSE(exeRegionOp);
EXPECT_TRUE(errorMsg.find("not yet supported") != std::string::npos);
}
//===----------------------------------------------------------------------===//
// cloneACCRegionInto Tests
//===----------------------------------------------------------------------===//
TEST_F(OpenACCUtilsLoopTest, CloneACCRegionIntoWithYield) {
auto [module, funcOp] = createModuleWithFunc();
Block *entry = &funcOp.getBody().front();
Value c0 = createIndexConstant(0);
Value c10 = createIndexConstant(10);
Value c1 = createIndexConstant(1);
acc::LoopOp loopOp = createLoopOp({c0}, {c10}, {c1});
// Add a constant to the loop body before the yield so the region has
// something to clone besides the terminator.
Block *loopBody = &loopOp.getRegion().front();
b.setInsertionPoint(loopBody->getTerminator());
arith::ConstantOp::create(b, loc, b.getI32IntegerAttr(42));
b.setInsertionPointToEnd(entry);
func::ReturnOp::create(b, loc);
IRMapping mapping;
mapping.map(loopBody->getArgument(0), c0);
auto [replacements, ip] = acc::cloneACCRegionInto(
&loopOp.getRegion(), entry, entry->begin(), mapping, ValueRange{});
EXPECT_TRUE(replacements.empty());
// The cloned block should have been merged: constant 42 present, no acc.yield
bool hasConst42 = false;
bool hasAccYield = false;
for (Operation &op : entry->getOperations()) {
if (auto cst = dyn_cast<arith::ConstantOp>(op))
hasConst42 = hasConst42 || (cst.getValue() == b.getI32IntegerAttr(42));
hasAccYield = hasAccYield || isa<acc::YieldOp>(op);
}
EXPECT_TRUE(hasConst42);
EXPECT_FALSE(hasAccYield);
}
TEST_F(OpenACCUtilsLoopTest, CloneACCRegionIntoWithResultReplacement) {
auto [module, funcOp] = createModuleWithFunc();
Block *entry = &funcOp.getBody().front();
// Value that will be replaced by the cloned region's yield operand
Value origVal =
arith::ConstantOp::create(b, loc, b.getI32IntegerAttr(0)).getResult();
Value c0 = createIndexConstant(0);
Value c10 = createIndexConstant(10);
Value c1 = createIndexConstant(1);
acc::LoopOp loopOp = createLoopOp({c0}, {c10}, {c1});
Block *loopBody = &loopOp.getRegion().front();
b.setInsertionPoint(loopBody->getTerminator());
Value replacementVal =
arith::ConstantOp::create(b, loc, b.getI32IntegerAttr(1)).getResult();
loopBody->getTerminator()->erase();
b.setInsertionPointToEnd(loopBody);
acc::YieldOp::create(b, loc, ValueRange{replacementVal});
b.setInsertionPointToEnd(entry);
Value c1value =
arith::ConstantOp::create(b, loc, b.getI32IntegerAttr(1)).getResult();
Value addResult = arith::AddIOp::create(b, loc, origVal, c1value)
.getResult(); // use of origVal
(void)addResult;
func::ReturnOp::create(b, loc);
IRMapping mapping;
mapping.map(loopBody->getArgument(0), c0);
auto [replacements, ip] = acc::cloneACCRegionInto(
&loopOp.getRegion(), entry, entry->begin(), mapping, ValueRange{origVal});
ASSERT_EQ(replacements.size(), 1u);
// The addi should now use the replacement (constant 1), not origVal
bool addiUsesReplacement = false;
for (Operation &op : entry->getOperations()) {
if (auto addi = dyn_cast<arith::AddIOp>(op))
addiUsesReplacement = (addi.getLhs() == replacements[0]);
}
EXPECT_TRUE(addiUsesReplacement);
}