From 21f439f13250bd9b7c19c8dd838177a04bf091ef Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 20 Mar 2026 10:21:15 +0000 Subject: [PATCH] [LoopRotate] Use SCEV exit counts to improve rotation profitability (#187483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most loop transformations, like unrolling and vectorization, expect the latch branch to be countable. Allow rotation if it turns the latch from uncountable to countable. This uses SCEV to check for countable exits if CheckExitCount is set. Currently it is not set for the LPM1 run (where SCEV is not used by other passes), only in LPM. With that, compile-time impact is mostly neutral: https://llvm-compile-time-tracker.com/compare.php?from=eba342d0ba930a404a026c80aada51c43974f0db&to=2e676337b45fae63ce9498116d8e6e43772363c5&stat=instructions:u ClamAV is consistently slower (~+0.15%) and 7zip faster in most cases (~-0.13%). Across a large test set based on C/C++ workloads, this rotates ~0.8% more loops with ~2.68M rotated loops. For the test set, ~2.7% more loops are runtime-unrolled and +6.36% more early exit loops vectorized on ARM64 macOS. This fixes a regression where std::ranges::find_last loops stopped being runtime-unrolled after https://github.com/llvm/llvm-project/commit/5f648c370edf5d71c471ffbabdaaa821ad05fb4b which changed the loop structure so we stopped rotating. https://clang.godbolt.org/z/6baeE1av6 Based on https://github.com/llvm/llvm-project/pull/162654. 
Co-authored-by: Marek Sedláček PR: https://github.com/llvm/llvm-project/pull/187483 --- .../llvm/Transforms/Scalar/LoopRotation.h | 3 +- .../llvm/Transforms/Utils/LoopRotationUtils.h | 3 +- llvm/lib/Passes/PassBuilder.cpp | 16 +- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/lib/Passes/PassRegistry.def | 8 +- llvm/lib/Transforms/Scalar/LoopRotation.cpp | 18 +- .../Transforms/Utils/LoopRotationUtils.cpp | 29 +++- llvm/test/Other/new-pm-print-pipeline.ll | 6 +- .../Transforms/LoopRotate/rotate-exitcount.ll | 155 +++++++++++++++++ ...e-to-enable-unrolling-and-vectorization.ll | 164 ++++++++++++++++++ 10 files changed, 376 insertions(+), 28 deletions(-) create mode 100644 llvm/test/Transforms/LoopRotate/rotate-exitcount.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/loop-rotate-to-enable-unrolling-and-vectorization.ll diff --git a/llvm/include/llvm/Transforms/Scalar/LoopRotation.h b/llvm/include/llvm/Transforms/Scalar/LoopRotation.h index cd108f7383e4..256345b26d57 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopRotation.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopRotation.h @@ -24,7 +24,7 @@ class Loop; class LoopRotatePass : public PassInfoMixin { public: LoopRotatePass(bool EnableHeaderDuplication = true, - bool PrepareForLTO = false); + bool PrepareForLTO = false, bool CheckExitCount = false); PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); @@ -34,6 +34,7 @@ public: private: const bool EnableHeaderDuplication; const bool PrepareForLTO; + const bool CheckExitCount; }; } diff --git a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h index c3643e0f27f9..18a1c4efcaab 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h @@ -37,7 +37,8 @@ LLVM_ABI bool LoopRotation(Loop *L, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater 
*MSSAU, const SimplifyQuery &SQ, bool RotationOnly, unsigned Threshold, - bool IsUtilMode, bool PrepareForLTO = false); + bool IsUtilMode, bool PrepareForLTO = false, + bool CheckExitCount = false); } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index ea2380448c06..a23d64b491a7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1274,17 +1274,25 @@ Expected parseLICMOptions(StringRef Params) { return Result; } -Expected> parseLoopRotateOptions(StringRef Params) { - std::pair Result = {true, false}; +struct LoopRotateOptions { + bool EnableHeaderDuplication = true; + bool PrepareForLTO = false; + bool CheckExitCount = false; +}; + +Expected parseLoopRotateOptions(StringRef Params) { + LoopRotateOptions Result; while (!Params.empty()) { StringRef ParamName; std::tie(ParamName, Params) = Params.split(';'); bool Enable = !ParamName.consume_front("no-"); if (ParamName == "header-duplication") { - Result.first = Enable; + Result.EnableHeaderDuplication = Enable; } else if (ParamName == "prepare-for-lto") { - Result.second = Enable; + Result.PrepareForLTO = Enable; + } else if (ParamName == "check-exit-count") { + Result.CheckExitCount = Enable; } else { return make_error( formatv("invalid LoopRotate pass parameter '{}'", ParamName).str(), diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 123541a98545..5e4688f4dd7e 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1579,7 +1579,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // Disable header duplication at -Oz. LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication || Level != OptimizationLevel::Oz, - LTOPreLink)); + LTOPreLink, /*CheckExitCount=*/true)); // Some loops may have become dead by now. Try to delete them. 
// FIXME: see discussion in https://reviews.llvm.org/D112851, // this may need to be revisited once we run GVN before loop deletion diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 99e0c6bc9121..c92d93d7ae39 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -800,12 +800,14 @@ LOOP_PASS_WITH_PARAMS( parseLICMOptions, "allowspeculation;no-allowspeculation") LOOP_PASS_WITH_PARAMS( "loop-rotate", "LoopRotatePass", - [](std::pair Params) { - return LoopRotatePass(Params.first, Params.second); + [](LoopRotateOptions Params) { + return LoopRotatePass(Params.EnableHeaderDuplication, Params.PrepareForLTO, + Params.CheckExitCount); }, parseLoopRotateOptions, "no-header-duplication;header-duplication;" - "no-prepare-for-lto;prepare-for-lto") + "no-prepare-for-lto;prepare-for-lto;" + "no-check-exit-count;check-exit-count") LOOP_PASS_WITH_PARAMS( "simple-loop-unswitch", "SimpleLoopUnswitchPass", [](std::pair Params) { diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 112761f81016..50d44369a40d 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -38,9 +38,10 @@ static cl::opt PrepareForLTOOption( cl::desc("Run loop-rotation in the prepare-for-lto stage. 
This option " "should be used for testing only.")); -LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO) +LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO, + bool CheckExitCount) : EnableHeaderDuplication(EnableHeaderDuplication), - PrepareForLTO(PrepareForLTO) {} + PrepareForLTO(PrepareForLTO), CheckExitCount(CheckExitCount) {} void LoopRotatePass::printPipeline( raw_ostream &OS, function_ref MapClassName2PassName) { @@ -53,7 +54,11 @@ void LoopRotatePass::printPipeline( if (!PrepareForLTO) OS << "no-"; - OS << "prepare-for-lto"; + OS << "prepare-for-lto;"; + + if (!CheckExitCount) + OS << "no-"; + OS << "check-exit-count"; OS << ">"; } @@ -74,9 +79,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, std::optional MSSAU; if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); - bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, - MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold, - false, PrepareForLTO || PrepareForLTOOption); + bool Changed = + LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, + MSSAU ? 
&*MSSAU : nullptr, SQ, false, Threshold, false, + PrepareForLTO || PrepareForLTOOption, CheckExitCount); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index bf236d48f58f..c8bc5e4daeff 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -63,16 +63,18 @@ class LoopRotate { bool RotationOnly; bool IsUtilMode; bool PrepareForLTO; + bool CheckExitCount; public: LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode, - bool PrepareForLTO) + bool PrepareForLTO, bool CheckExitCount) : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly), - IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {} + IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO), + CheckExitCount(CheckExitCount) {} bool processLoop(Loop *L); private: @@ -178,11 +180,12 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -// Assuming both header and latch are exiting, look for a phi which is only -// used outside the loop (via a LCSSA phi) in the exit from the header. -// This means that rotating the loop can remove the phi. -static bool profitableToRotateLoopExitingLatch(Loop *L) { +// Assuming both header and latch are exiting, check if rotating is profitable: +// either a header phi becomes dead, or rotating makes the latch exit count +// computable (enabling downstream optimizations like unrolling/vectorization). 
+static bool profitableToRotateLoopExitingLatch(Loop *L, ScalarEvolution *SE) { BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); CondBrInst *BI = dyn_cast(Header->getTerminator()); BasicBlock *HeaderExit = BI->getSuccessor(0); if (L->contains(HeaderExit)) @@ -196,6 +199,13 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { continue; return true; } + + // Check if rotating would make the latch exit count computable, enabling + // optimizations like runtime unrolling and vectorization. + if (SE && isa(SE->getExitCount(L, Latch)) && + !isa(SE->getExitCount(L, Header))) + return true; + return false; } @@ -363,7 +373,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Rotate if the loop latch was just simplified. Or if it makes the loop exit // count computable. Or if we think it will be profitable. if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L)) + !profitableToRotateLoopExitingLatch(L, CheckExitCount ? 
SE : nullptr)) return Rotated; // Check size of original header and reject loop if it is very big or we can't @@ -965,8 +975,9 @@ bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ, bool RotationOnly = true, unsigned Threshold = unsigned(-1), - bool IsUtilMode = true, bool PrepareForLTO) { + bool IsUtilMode = true, bool PrepareForLTO, + bool CheckExitCount) { LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly, - IsUtilMode, PrepareForLTO); + IsUtilMode, PrepareForLTO, CheckExitCount); return LR.processLoop(L); } diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 3536932f4432..53a17975c512 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -4,7 +4,7 @@ ; CHECK-0: function(adce),function(adce) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='module(rpo-function-attrs,require,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-1 -; CHECK-1: rpo-function-attrs,require,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate +; CHECK-1: rpo-function-attrs,require,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate ;; Test that we get ClassName printed when there is no ClassName to pass-name mapping (as is the case for the BitcodeWriterPass). ; RUN: opt -o /dev/null -disable-verify -print-pipeline-passes -passes='function(mem2reg)' < %s -disable-pipeline-verification | FileCheck %s --match-full-lines --check-prefixes=CHECK-3 @@ -66,7 +66,7 @@ ;; Test that the loop-nest-pass lnicm is printed with the other loop-passes in the pipeline. 
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-mssa(licm,loop-rotate,loop-deletion,lnicm,loop-rotate))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-23 -; CHECK-23: function(loop-mssa(licm,loop-rotate,loop-deletion,lnicm,loop-rotate)) +; CHECK-23: function(loop-mssa(licm,loop-rotate,loop-deletion,lnicm,loop-rotate)) ;; Test that -debugify and -check-debugify is printed correctly. ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='debugify,no-op-function,check-debugify' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-24 @@ -110,7 +110,7 @@ ; CHECK-32: cgscc(function(no-op-function)) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop(loop-rotate))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-33 -; CHECK-33: function(loop(loop-rotate)) +; CHECK-33: function(loop(loop-rotate)) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='globaldce' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-34 ; CHECK-34: globaldce diff --git a/llvm/test/Transforms/LoopRotate/rotate-exitcount.ll b/llvm/test/Transforms/LoopRotate/rotate-exitcount.ll new file mode 100644 index 000000000000..e21d3f05fc08 --- /dev/null +++ b/llvm/test/Transforms/LoopRotate/rotate-exitcount.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes='loop(loop-rotate)' %s | FileCheck %s --check-prefixes=CHECK,NO-EXIT-COUNT +; RUN: opt -S -passes='loop(loop-rotate)' %s | FileCheck %s --check-prefixes=CHECK,EXIT-COUNT + +; Computable header exit, data-dependent latch exit. Rotated with check-exit-count. 
+define ptr @search_loop(ptr %begin, ptr %end, i8 %val) { +; NO-EXIT-COUNT-LABEL: @search_loop( +; NO-EXIT-COUNT-NEXT: entry: +; NO-EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]] +; NO-EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; NO-EXIT-COUNT: for.cond.preheader: +; NO-EXIT-COUNT-NEXT: br label [[FOR_COND:%.*]] +; NO-EXIT-COUNT: for.cond: +; NO-EXIT-COUNT-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_DEC:%.*]], [[FOR_BODY:%.*]] ], [ [[END]], [[FOR_COND_PREHEADER]] ] +; NO-EXIT-COUNT-NEXT: [[PTR_DEC]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1 +; NO-EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]] +; NO-EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[NOT_FOUND:%.*]], label [[FOR_BODY]] +; NO-EXIT-COUNT: for.body: +; NO-EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC]], align 1 +; NO-EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]] +; NO-EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]] +; NO-EXIT-COUNT: found: +; NO-EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC]], [[FOR_BODY]] ] +; NO-EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]] +; NO-EXIT-COUNT: not.found: +; NO-EXIT-COUNT-NEXT: br label [[EXIT]] +; NO-EXIT-COUNT: exit: +; NO-EXIT-COUNT-NEXT: ret ptr [[END]] +; +; EXIT-COUNT-LABEL: @search_loop( +; EXIT-COUNT-NEXT: entry: +; EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]] +; EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; EXIT-COUNT: for.cond.preheader: +; EXIT-COUNT-NEXT: [[PTR_DEC2:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1 +; EXIT-COUNT-NEXT: [[CMP_END3:%.*]] = icmp eq ptr [[PTR_DEC2]], [[BEGIN]] +; EXIT-COUNT-NEXT: br i1 [[CMP_END3]], label [[NOT_FOUND:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; EXIT-COUNT: for.body.lr.ph: +; EXIT-COUNT-NEXT: br label [[FOR_BODY:%.*]] +; EXIT-COUNT: for.cond: +; EXIT-COUNT-NEXT: [[PTR:%.*]] = 
phi ptr [ [[PTR_DEC4:%.*]], [[FOR_BODY]] ] +; EXIT-COUNT-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1 +; EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]] +; EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[FOR_COND_NOT_FOUND_CRIT_EDGE:%.*]], label [[FOR_BODY]] +; EXIT-COUNT: for.body: +; EXIT-COUNT-NEXT: [[PTR_DEC4]] = phi ptr [ [[PTR_DEC2]], [[FOR_BODY_LR_PH]] ], [ [[PTR_DEC]], [[FOR_COND:%.*]] ] +; EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC4]], align 1 +; EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]] +; EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]] +; EXIT-COUNT: found: +; EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC4]], [[FOR_BODY]] ] +; EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]] +; EXIT-COUNT: for.cond.not.found_crit_edge: +; EXIT-COUNT-NEXT: br label [[NOT_FOUND]] +; EXIT-COUNT: not.found: +; EXIT-COUNT-NEXT: br label [[EXIT]] +; EXIT-COUNT: exit: +; EXIT-COUNT-NEXT: ret ptr [[END]] +; +entry: + %cmp.entry = icmp eq ptr %begin, %end + br i1 %cmp.entry, label %exit, label %for.cond + +for.cond: + %ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ] + %ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1 + %cmp.end = icmp eq ptr %ptr.dec, %begin + br i1 %cmp.end, label %not.found, label %for.body + +for.body: + %load = load i8, ptr %ptr.dec, align 1 + %cmp.val = icmp eq i8 %load, %val + br i1 %cmp.val, label %found, label %for.cond + +found: + ret ptr %ptr.dec + +not.found: + br label %exit + +exit: + ret ptr %end +} + +; Both exits are memory-dependent (not computable). Not rotated. 
+define i32 @both_mem_dependent(ptr %p, ptr %q, i32 %n) { +; CHECK-LABEL: @both_mem_dependent( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BODY:%.*]] ] +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LOAD1]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT1:%.*]], label [[BODY]] +; CHECK: body: +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[LOAD2]], 0 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT2:%.*]], label [[LOOP]] +; CHECK: exit1: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA]] +; CHECK: exit2: +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BODY]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %body ] + %load1 = load i32, ptr %p, align 4 + %cmp1 = icmp eq i32 %load1, 0 + br i1 %cmp1, label %exit1, label %body + +body: + %load2 = load i32, ptr %q, align 4 + %cmp2 = icmp eq i32 %load2, 0 + %iv.next = add i32 %iv, 1 + br i1 %cmp2, label %exit2, label %loop + +exit1: + ret i32 %iv + +exit2: + ret i32 %iv +} + +; Latch is unconditional, already properly structured. 
+define void @already_rotated(ptr %p, i32 %n) { +; CHECK-LABEL: @already_rotated( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: store i32 [[IV]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + store i32 %iv, ptr %p, align 4 + %iv.next = add nuw i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, %n + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loop-rotate-to-enable-unrolling-and-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loop-rotate-to-enable-unrolling-and-vectorization.ll new file mode 100644 index 000000000000..e23d1426d736 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loop-rotate-to-enable-unrolling-and-vectorization.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -O3 %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; Test that a search loop with computable header exit gets rotated and +; runtime-unrolled at -O3. 
+define ptr @search_loop_unrolled(ptr %begin, ptr %end, i8 %val) { +; CHECK-LABEL: define ptr @search_loop_unrolled( +; CHECK-SAME: ptr readnone captures(address) [[BEGIN:%.*]], ptr readonly captures(address, ret: address, provenance) [[END:%.*]], i8 [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN]], [[END]] +; CHECK-NEXT: [[PTR_DEC1:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1 +; CHECK-NEXT: [[CMP_END2:%.*]] = icmp eq ptr [[PTR_DEC1]], [[BEGIN]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP_ENTRY]], i1 true, i1 [[CMP_END2]] +; CHECK-NEXT: br i1 [[OR_COND]], label %[[COMMON_RET:.*]], label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[END5:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[BEGIN6:%.*]] = ptrtoint ptr [[BEGIN]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[BEGIN6]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[END5]] +; CHECK-NEXT: [[TMP2:%.*]] = freeze i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 3 +; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]], label %[[FOR_BODY_PROL:.*]] +; CHECK: [[FOR_BODY_PROL]]: +; CHECK-NEXT: [[PTR_DEC5:%.*]] = phi ptr [ [[PTR_DEC4:%.*]], %[[FOR_COND_PROL:.*]] ], [ [[PTR_DEC1]], %[[FOR_COND]] ] +; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], %[[FOR_COND_PROL]] ], [ 0, %[[FOR_COND]] ] +; CHECK-NEXT: [[LOAD_PROL:%.*]] = load i8, ptr [[PTR_DEC5]], align 1 +; CHECK-NEXT: [[CMP_VAL_PROL:%.*]] = icmp eq i8 [[LOAD_PROL]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP_VAL_PROL]], label %[[COMMON_RET]], label %[[FOR_COND_PROL]] +; CHECK: [[FOR_COND_PROL]]: +; CHECK-NEXT: [[PTR_DEC4]] = getelementptr inbounds i8, ptr [[PTR_DEC5]], i64 -1 +; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = 
icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]: +; CHECK-NEXT: [[PTR_DEC3_UNR:%.*]] = phi ptr [ [[PTR_DEC1]], %[[FOR_COND]] ], [ [[PTR_DEC4]], %[[FOR_COND_PROL]] ] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 3 +; CHECK-NEXT: br i1 [[TMP4]], label %[[COMMON_RET]], label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND1:.*]]: +; CHECK-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3:%.*]], i64 -1 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i8, ptr [[PTR_DEC]], align 1 +; CHECK-NEXT: [[CMP_VAL_1:%.*]] = icmp eq i8 [[LOAD_1]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP_VAL_1]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14:.*]], label %[[FOR_COND_1:.*]] +; CHECK: [[FOR_COND_1]]: +; CHECK-NEXT: [[PTR_DEC_1:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i8, ptr [[PTR_DEC_1]], align 1 +; CHECK-NEXT: [[CMP_VAL_2:%.*]] = icmp eq i8 [[LOAD_2]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP_VAL_2]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12:.*]], label %[[FOR_COND_2:.*]] +; CHECK: [[FOR_COND_2]]: +; CHECK-NEXT: [[PTR_DEC_2:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i8, ptr [[PTR_DEC_2]], align 1 +; CHECK-NEXT: [[CMP_VAL_3:%.*]] = icmp eq i8 [[LOAD_3]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP_VAL_3]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10:.*]], label %[[FOR_COND_3:.*]] +; CHECK: [[FOR_COND_3]]: +; CHECK-NEXT: [[PTR_DEC_3:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -4 +; CHECK-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC_3]], [[BEGIN]] +; CHECK-NEXT: br i1 [[CMP_END]], label %[[COMMON_RET]], label %[[FOR_BODY]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[PTR_DEC3]] = phi ptr [ [[PTR_DEC_3]], %[[FOR_COND_3]] ], [ [[PTR_DEC3_UNR]], 
%[[FOR_BODY_PROL_LOOPEXIT]] ] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC3]], align 1 +; CHECK-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP_VAL]], label %[[COMMON_RET]], label %[[FOR_COND1]] +; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]]: +; CHECK-NEXT: [[PTR_DEC_2_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]]: +; CHECK-NEXT: [[PTR_DEC_1_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]]: +; CHECK-NEXT: [[PTR_DEC_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -1 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi ptr [ [[END]], %[[ENTRY]] ], [ [[END]], %[[FOR_BODY_PROL_LOOPEXIT]] ], [ [[PTR_DEC3]], %[[FOR_BODY]] ], [ [[PTR_DEC_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]] ], [ [[PTR_DEC_2_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]] ], [ [[PTR_DEC_1_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]] ], [ [[END]], %[[FOR_COND_3]] ], [ [[PTR_DEC5]], %[[FOR_BODY_PROL]] ] +; CHECK-NEXT: ret ptr [[COMMON_RET_OP]] +; +entry: + %cmp.entry = icmp eq ptr %begin, %end + br i1 %cmp.entry, label %exit, label %for.cond + +for.cond: + %ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ] + %ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1 + %cmp.end = icmp eq ptr %ptr.dec, %begin + br i1 %cmp.end, label %not.found, label %for.body + +for.body: + %load = load i8, ptr %ptr.dec, align 1 + %cmp.val = icmp eq i8 %load, %val + br i1 %cmp.val, label %found, label %for.cond + +found: + ret ptr %ptr.dec + +not.found: + br label %exit + +exit: + ret ptr %end +} + +define i64 @rotate_needed_to_vectorize(ptr noalias %scan, ptr noalias 
%match) { +; CHECK-LABEL: define i64 @rotate_needed_to_vectorize( +; CHECK-SAME: ptr noalias readonly captures(none) [[SCAN:%.*]], ptr noalias readonly captures(none) [[MATCH:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[EXIT:.*]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[SCAN]], i64 [[LEN]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP1]], align 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr [[MATCH]], i64 [[LEN]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[GEP2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label %[[EXIT]], label %[[VECTOR_EARLY_EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[LEN]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label %[[EXIT1:.*]], label %[[HEADER]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[LEN]], [[TMP6]] +; CHECK-NEXT: br label %[[EXIT1]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: [[LEN_LCSSA:%.*]] = phi i64 [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ], [ 1024, %[[EXIT]] ] +; CHECK-NEXT: ret i64 [[LEN_LCSSA]] +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %scan, i64 1024) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %match, i64 1024) ] + br label %header + +header: + %len = phi i64 [ 0, %entry ], [ %len.next, %body ] + %cmp.limit = icmp eq i64 %len, 1024 + br i1 
%cmp.limit, label %exit, label %body + +body: + %gep1 = getelementptr inbounds i8, ptr %scan, i64 %len + %v1 = load i8, ptr %gep1, align 1 + %gep2 = getelementptr inbounds i8, ptr %match, i64 %len + %v2 = load i8, ptr %gep2, align 1 + %cmp.data = icmp eq i8 %v1, %v2 + %len.next = add nuw i64 %len, 1 + br i1 %cmp.data, label %header, label %mismatch + +mismatch: + br label %exit + +exit: + %result = phi i64 [ 1024, %header ], [ %len, %mismatch ] + ret i64 %result +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} +;.