[mlir][OpenMP] Allow tile composition (#185380)

The verifier of the TileOp did not allow composition of multiple
transformations out of precaution. However, composition works, so this
change removes the "currently only supports omp.canonical_loop as applyee"
check and adds regression tests.
This commit is contained in:
Michael Kruse 2026-03-10 13:58:23 +01:00 committed by GitHub
parent 2826924543
commit cd3cab70fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 337 additions and 38 deletions

View File

@ -3802,42 +3802,47 @@ static ParseResult parseLoopTransformClis(
return success();
}
LogicalResult TileOp::verify() {
if (getApplyees().empty())
return emitOpError() << "must apply to at least one loop";
if (getSizes().size() != getApplyees().size())
return emitOpError() << "there must be one tile size for each applyee";
if (!getGeneratees().empty() &&
2 * getSizes().size() != getGeneratees().size())
return emitOpError()
<< "expecting two times the number of generatees than applyees";
DenseSet<Value> parentIVs;
Value parent = getApplyees().front();
for (auto &&applyee : llvm::drop_begin(getApplyees())) {
auto [parentCreate, parentGen, parentCons] = decodeCli(parent);
/// Check properties of the loop nest consisting of the transformation's
/// applyees:
/// 1. They are nested inside each other
/// 2. They are perfectly nested
/// (no code with side-effects in-between the loops)
/// 3. They are rectangular
/// (loop bounds are invariant with respect to the outer loops)
///
/// TODO: Generalize for LoopTransformationInterface.
static LogicalResult checkApplyeesNesting(TileOp op) {
// Collect the loops from the nest
bool isOnlyCanonLoops = true;
SmallVector<CanonicalLoopOp> canonLoops;
for (Value applyee : op.getApplyees()) {
auto [create, gen, cons] = decodeCli(applyee);
if (!parentGen)
return emitOpError() << "applyee CLI has no generator";
if (!gen)
return op.emitOpError() << "applyee CLI has no generator";
auto parentLoop = dyn_cast_or_null<CanonicalLoopOp>(parentGen->getOwner());
if (!parentGen)
return emitOpError()
<< "currently only supports omp.canonical_loop as applyee";
auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner());
canonLoops.push_back(loop);
if (!loop)
isOnlyCanonLoops = false;
}
// FIXME: We currently can only verify rectangularity and perfect nesting of
// omp.canonical_loop.
if (!isOnlyCanonLoops)
return success();
DenseSet<Value> parentIVs;
for (auto i : llvm::seq<int>(1, canonLoops.size())) {
auto parentLoop = canonLoops[i - 1];
auto loop = canonLoops[i];
if (parentLoop.getOperation() != loop.getOperation()->getParentOp())
return op.emitOpError()
<< "tiled loop nest must be nested within each other";
parentIVs.insert(parentLoop.getInductionVar());
if (!gen)
return emitOpError() << "applyee CLI has no generator";
auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner());
if (!loop)
return emitOpError()
<< "currently only supports omp.canonical_loop as applyee";
// Canonical loop must be perfectly nested, i.e. the body of the parent must
// only contain the omp.canonical_loop of the nested loops, and
// omp.terminator
@ -3862,12 +3867,10 @@ LogicalResult TileOp::verify() {
return true;
}();
if (!isPerfectlyNested)
return emitOpError() << "tiled loop nest must be perfectly nested";
return op.emitOpError() << "tiled loop nest must be perfectly nested";
if (parentIVs.contains(loop.getTripCount()))
return emitOpError() << "tiled loop nest must be rectangular";
parent = applyee;
return op.emitOpError() << "tiled loop nest must be rectangular";
}
// TODO: The tile sizes must be computed before the loop, but checking this
@ -3884,6 +3887,21 @@ LogicalResult TileOp::verify() {
return success();
}
/// Verifies the operand structure of an omp.tile operation.
///
/// The checks are performed in this order:
///  1. There is at least one applyee.
///  2. The number of tile sizes equals the number of applyees.
///  3. Generatees may be omitted entirely; if any are given, there must be
///     exactly twice as many generatees as tile sizes.
///  4. The applyees pass checkApplyeesNesting.
///
/// Returns the first failure encountered (emitted as an op error), or the
/// result of checkApplyeesNesting if all operand-count checks pass.
LogicalResult TileOp::verify() {
  const size_t numApplyees = getApplyees().size();
  const size_t numSizes = getSizes().size();
  const size_t numGeneratees = getGeneratees().size();

  if (numApplyees == 0)
    return emitOpError() << "must apply to at least one loop";

  if (numSizes != numApplyees)
    return emitOpError() << "there must be one tile size for each applyee";

  // An empty generatee list is accepted; a non-empty one must have two
  // entries per tile size.
  if (numGeneratees != 0 && numGeneratees != 2 * numSizes)
    return emitOpError()
           << "expecting two times the number of generatees than applyees";

  return checkApplyeesNesting(*this);
}
/// Returns the pair that getODSOperandIndexAndLength reports for the
/// `applyees` operand group (selected via odsIndex_applyees).
std::pair<unsigned, unsigned> TileOp::getApplyeesODSOperandIndexAndLength() {
  auto applyeesIndexAndLength = getODSOperandIndexAndLength(odsIndex_applyees);
  return applyeesIndexAndLength;
}

View File

@ -136,3 +136,73 @@ func.func @omp_tile_3d_pretty(%tc : i32, %ts : i32) -> () {
omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%cli_outer, %cli_middle, %cli_inner) sizes(%ts, %ts, %ts: i32, i32, i32)
return
}
// Composition of multiple tilings
//
// A single canonical loop is tiled once; the two CLIs on the generatee side of
// that first omp.tile (%grid, %intratile) are then each used as the applyee of
// a further omp.tile. The directives below only verify that the ops round-trip
// through the printer with the expected operand wiring.
// CHECK-LABEL: @omp_tile_composition(
// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32, %[[grid_ts:.+]]: i32, %[[intratile_ts:.+]]: i32) {
func.func @omp_tile_composition(%tc: i32, %ts: i32, %grid_ts: i32, %intratile_ts: i32) -> () {
// One CLI for the original loop plus two per omp.tile below.
%canonloop = omp.new_cli
%grid = omp.new_cli
%intratile = omp.new_cli
%grid_intratile = omp.new_cli
%grid_grid = omp.new_cli
%intratile_grid = omp.new_cli
%intratile_intratile = omp.new_cli
// CHECK: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) {
omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) {
// CHECK: omp.terminator
omp.terminator
}
// First tiling: applies to the canonical loop itself.
// CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32)
omp.tile(%grid, %intratile) <- (%canonloop) sizes(%ts : i32)
// Second tiling: applies to %grid, a generatee of the first tiling.
// CHECK: omp.tile (%grid1_1, %intratile1_0) <- (%grid1) sizes(%[[grid_ts]] : i32)
omp.tile(%grid_grid, %grid_intratile) <- (%grid) sizes(%grid_ts : i32)
// Third tiling: applies to %intratile, the other generatee of the first tiling.
// CHECK: omp.tile (%grid1_2, %intratile1_3) <- (%intratile1) sizes(%[[intratile_ts]] : i32)
omp.tile(%intratile_grid, %intratile_intratile) <- (%intratile) sizes(%intratile_ts : i32)
// CHECK: return
return
}
// Composition of multiple 2d-tilings
//
// A two-deep canonical loop nest is tiled with two sizes; the grid CLIs and the
// intratile CLIs on the generatee side of that first omp.tile are then each
// used as the applyees of another two-dimensional omp.tile.
// CHECK-LABEL: @omp_tile_2d_composition(
// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[ts:.+]]: i32) {
func.func @omp_tile_2d_composition(%tc1: i32, %tc2: i32, %ts: i32) -> () {
// Two CLIs for the original nest plus four per omp.tile below.
%cli_outer = omp.new_cli
%cli_inner = omp.new_cli
%outer_grid = omp.new_cli
%inner_grid = omp.new_cli
%outer_intratile = omp.new_cli
%inner_intratile = omp.new_cli
%outer_grid_grid = omp.new_cli
%inner_grid_grid = omp.new_cli
%outer_grid_intratile = omp.new_cli
%inner_grid_intratile = omp.new_cli
%outer_intratile_grid = omp.new_cli
%inner_intratile_grid = omp.new_cli
%outer_intratile_intratile = omp.new_cli
%inner_intratile_intratile = omp.new_cli
// CHECK: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc1]]) {
omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc1) {
// CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc2]]) {
omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc2) {
// CHECK: omp.terminator
omp.terminator
}
// CHECK: omp.terminator
omp.terminator
}
// First tiling: applies to the original (outer, inner) canonical loop nest.
// CHECK: omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%canonloop, %canonloop_d1) sizes(%[[ts]], %[[ts]] : i32, i32)
omp.tile(%outer_grid, %inner_grid, %outer_intratile, %inner_intratile) <- (%cli_outer, %cli_inner) sizes(%ts, %ts : i32, i32)
// Second tiling: applies to the two grid CLIs of the first tiling.
// CHECK: omp.tile (%grid1_0, %grid2_1, %intratile1_2, %intratile2_3) <- (%grid1, %grid2) sizes(%[[ts]], %[[ts]] : i32, i32)
omp.tile(%outer_grid_grid, %inner_grid_grid, %outer_grid_intratile, %inner_grid_intratile) <- (%outer_grid, %inner_grid) sizes(%ts, %ts : i32, i32)
// Third tiling: applies to the two intratile CLIs of the first tiling.
// NOTE(review): the applyees here are listed as (%inner_intratile,
// %outer_intratile), i.e. in the opposite order from the other omp.tile ops in
// this function -- confirm that this ordering is intentional.
// CHECK: omp.tile (%grid1_4, %grid2_5, %intratile1_6, %intratile2_7) <- (%intratile2, %intratile1) sizes(%[[ts]], %[[ts]] : i32, i32)
omp.tile(%outer_intratile_grid, %inner_intratile_grid, %outer_intratile_intratile, %inner_intratile_intratile) <- (%inner_intratile, %outer_intratile) sizes(%ts, %ts : i32, i32)
// CHECK: return
return
}

View File

@ -39,10 +39,10 @@ func.func @missing_generator(%tc : i32, %ts : i32) {
func.func @insufficient_sizes(%tc : i32, %ts : i32) {
%canonloop1 = omp.new_cli
%canonloop2 = omp.new_cli
omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc) {
omp.terminator
}
omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc) {
omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) {
omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%tc) {
omp.terminator
}
omp.terminator
}
@ -83,6 +83,24 @@ func.func @insufficient_generatees(%tc : i32, %ts : i32) {
// -----
// Negative test: the two canonical loops are siblings in the function body
// rather than one being nested inside the other, so an omp.tile that names both
// as applyees must be rejected with the "nested within each other" diagnostic.
func.func @not_nested(%tc : i32, %ts : i32) {
%canonloop1 = omp.new_cli
%canonloop2 = omp.new_cli
omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) {
omp.terminator
}
omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%tc) {
omp.terminator
}
// expected-error@+1 {{'omp.tile' op tiled loop nest must be nested within each other}}
omp.tile <-(%canonloop1, %canonloop2) sizes(%ts, %ts : i32, i32)
llvm.return
}
// -----
func.func @not_perfectly_nested(%tc : i32, %ts : i32) {
%canonloop1 = omp.new_cli
%canonloop2 = omp.new_cli

View File

@ -0,0 +1,193 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope
// Translation test input for composed tilings: one canonical loop is tiled, and
// then both CLIs on the generatee side of that tiling (%grid and %intratile)
// are tiled again with their own tile sizes.
llvm.func @tile_composition(%baseptr: !llvm.ptr, %tc: i32, %ts: i32, %grid_ts: i32, %intratile_ts: i32) -> () {
// One CLI for the original loop plus two per omp.tile below.
%canonloop = omp.new_cli
%grid = omp.new_cli
%intratile = omp.new_cli
%grid_intratile = omp.new_cli
%grid_grid = omp.new_cli
%intratile_grid = omp.new_cli
%intratile_intratile = omp.new_cli
// Loop body: store the f32 constant 42.0 to element %idx of %baseptr.
omp.canonical_loop(%canonloop) %idx : i32 in range(%tc) {
%ptr = llvm.getelementptr inbounds %baseptr[%idx] : (!llvm.ptr, i32) -> !llvm.ptr, f32
%val = llvm.mlir.constant(42.0 : f32) : f32
llvm.store %val, %ptr : f32, !llvm.ptr
omp.terminator
}
// First tiling applies to the canonical loop; the next two apply to its
// generatees.
omp.tile(%grid, %intratile) <- (%canonloop) sizes(%ts : i32)
omp.tile(%grid_grid, %grid_intratile) <- (%grid) sizes(%grid_ts : i32)
omp.tile(%intratile_grid, %intratile_intratile) <- (%intratile) sizes(%intratile_ts : i32)
llvm.return
}
// CHECK-LABEL: define void @tile_composition(
// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]], i32 %[[TMP3:.+]], i32 %[[TMP4:.+]]) {
// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]:
// CHECK-NEXT: %[[TMP6:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP2:.+]]
// CHECK-NEXT: %[[TMP7:.+]] = urem i32 %[[TMP1:.+]], %[[TMP2:.+]]
// CHECK-NEXT: %[[TMP8:.+]] = icmp ne i32 %[[TMP7:.+]], 0
// CHECK-NEXT: %[[TMP9:.+]] = zext i1 %[[TMP8:.+]] to i32
// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP6:.+]], %[[TMP9:.+]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]:
// CHECK-NEXT: %[[TMP10:.+]] = udiv i32 %[[OMP_FLOOR0_TRIPCOUNT:.+]], %[[TMP3:.+]]
// CHECK-NEXT: %[[TMP11:.+]] = urem i32 %[[OMP_FLOOR0_TRIPCOUNT:.+]], %[[TMP3:.+]]
// CHECK-NEXT: %[[TMP12:.+]] = icmp ne i32 %[[TMP11:.+]], 0
// CHECK-NEXT: %[[TMP13:.+]] = zext i1 %[[TMP12:.+]] to i32
// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT1:.+]] = add nuw i32 %[[TMP10:.+]], %[[TMP13:.+]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER2:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER2]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER3:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_HEADER3]]:
// CHECK-NEXT: %[[OMP_FLOOR0_IV9:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER2:.+]] ], [ %[[OMP_FLOOR0_NEXT11:.+]], %[[OMP_FLOOR0_INC6:.+]] ]
// CHECK-NEXT: br label %[[OMP_FLOOR0_COND4:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_COND4]]:
// CHECK-NEXT: %[[OMP_FLOOR0_CMP10:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV9:.+]], %[[OMP_FLOOR0_TRIPCOUNT1:.+]]
// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP10:.+]], label %[[OMP_FLOOR0_BODY5:.+]], label %[[OMP_FLOOR0_EXIT7:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_BODY5]]:
// CHECK-NEXT: %[[TMP14:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV9:.+]], %[[TMP10:.+]]
// CHECK-NEXT: %[[TMP15:.+]] = select i1 %[[TMP14:.+]], i32 %[[TMP11:.+]], i32 %[[TMP3:.+]]
// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER12:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_PREHEADER12]]:
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER13:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_HEADER13]]:
// CHECK-NEXT: %[[OMP_TILE0_IV19:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER12:.+]] ], [ %[[OMP_TILE0_NEXT21:.+]], %[[OMP_TILE0_INC16:.+]] ]
// CHECK-NEXT: br label %[[OMP_TILE0_COND14:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_COND14]]:
// CHECK-NEXT: %[[OMP_TILE0_CMP20:.+]] = icmp ult i32 %[[OMP_TILE0_IV19:.+]], %[[TMP15:.+]]
// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP20:.+]], label %[[OMP_TILE0_BODY15:.+]], label %[[OMP_TILE0_EXIT17:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_BODY15]]:
// CHECK-NEXT: %[[TMP16:.+]] = mul nuw i32 %[[TMP3:.+]], %[[OMP_FLOOR0_IV9:.+]]
// CHECK-NEXT: %[[TMP17:.+]] = add nuw i32 %[[TMP16:.+]], %[[OMP_TILE0_IV19:.+]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_BODY:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_BODY]]:
// CHECK-NEXT: %[[TMP18:.+]] = icmp eq i32 %[[TMP17:.+]], %[[TMP6:.+]]
// CHECK-NEXT: %[[TMP19:.+]] = select i1 %[[TMP18:.+]], i32 %[[TMP7:.+]], i32 %[[TMP2:.+]]
// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]:
// CHECK-NEXT: %[[TMP20:.+]] = udiv i32 %[[TMP19:.+]], %[[TMP4:.+]]
// CHECK-NEXT: %[[TMP21:.+]] = urem i32 %[[TMP19:.+]], %[[TMP4:.+]]
// CHECK-NEXT: %[[TMP22:.+]] = icmp ne i32 %[[TMP21:.+]], 0
// CHECK-NEXT: %[[TMP23:.+]] = zext i1 %[[TMP22:.+]] to i32
// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT22:.+]] = add nuw i32 %[[TMP20:.+]], %[[TMP23:.+]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER23:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER23]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]:
// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER23:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ]
// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_COND]]:
// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT22:.+]]
// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY24:.+]], label %[[OMP_FLOOR0_EXIT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_BODY24]]:
// CHECK-NEXT: %[[TMP24:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP20:.+]]
// CHECK-NEXT: %[[TMP25:.+]] = select i1 %[[TMP24:.+]], i32 %[[TMP21:.+]], i32 %[[TMP4:.+]]
// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER26:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_PREHEADER26]]:
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER27:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_HEADER27]]:
// CHECK-NEXT: %[[OMP_TILE0_IV33:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER26:.+]] ], [ %[[OMP_TILE0_NEXT35:.+]], %[[OMP_TILE0_INC30:.+]] ]
// CHECK-NEXT: br label %[[OMP_TILE0_COND28:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_COND28]]:
// CHECK-NEXT: %[[OMP_TILE0_CMP34:.+]] = icmp ult i32 %[[OMP_TILE0_IV33:.+]], %[[TMP25:.+]]
// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP34:.+]], label %[[OMP_TILE0_BODY29:.+]], label %[[OMP_TILE0_EXIT31:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_BODY29]]:
// CHECK-NEXT: %[[TMP26:.+]] = mul nuw i32 %[[TMP4:.+]], %[[OMP_FLOOR0_IV:.+]]
// CHECK-NEXT: %[[TMP27:.+]] = add nuw i32 %[[TMP26:.+]], %[[OMP_TILE0_IV33:.+]]
// CHECK-NEXT: br label %[[OMP_TILE0_BODY:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_BODY]]:
// CHECK-NEXT: %[[TMP28:.+]] = mul nuw i32 %[[TMP2:.+]], %[[TMP17:.+]]
// CHECK-NEXT: %[[TMP29:.+]] = add nuw i32 %[[TMP28:.+]], %[[TMP27:.+]]
// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]:
// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_REGION]]:
// CHECK-NEXT: %[[TMP30:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP29:.+]]
// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP30:.+]], align 4
// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_REGION_CONT]]:
// CHECK-NEXT: br label %[[OMP_TILE0_INC30:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_INC30]]:
// CHECK-NEXT: %[[OMP_TILE0_NEXT35:.+]] = add nuw i32 %[[OMP_TILE0_IV33:.+]], 1
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER27:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_EXIT31]]:
// CHECK-NEXT: br label %[[OMP_TILE0_AFTER32:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_AFTER32]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_INC]]:
// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER25:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_AFTER25]]:
// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_TILE0_INC16:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_INC16]]:
// CHECK-NEXT: %[[OMP_TILE0_NEXT21:.+]] = add nuw i32 %[[OMP_TILE0_IV19:.+]], 1
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER13:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_EXIT17]]:
// CHECK-NEXT: br label %[[OMP_TILE0_AFTER18:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_AFTER18]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_INC6:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_INC6]]:
// CHECK-NEXT: %[[OMP_FLOOR0_NEXT11:.+]] = add nuw i32 %[[OMP_FLOOR0_IV9:.+]], 1
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER3:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_EXIT7]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER8:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_AFTER8]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]:
// CHECK-NEXT: ret void
// CHECK-NEXT: }
// CHECK: !llvm.module.flags = !{!0}
// CHECK-EMPTY:
// CHECK-NEXT: !0 = !{i32 2, !"Debug Info Version", i32 3}