llvm-project/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch3.ll
Teresa Johnson dfb40d3fd7 [SimpleLoopUnswitch] Skip non-trivial unswitching of cold loop nests
This fixes a compile time issue due to guarding loop unswitching based
on whether the enclosing function is cold. That approach is very
inefficient in the case of large cold functions that contain numerous
loops, since the loop pass calls isFunctionColdInCallGraph once per
loop, and that function walks all BBs in the function (twice for Sample
PGO) looking for any non-cold blocks.

Originally, this code only checked if the current Loop's header was cold
(D129599). However, that apparently caused a slowdown on a SPEC
benchmark, and the example given was that of a cold inner loop nested in
a non-cold outer loop (see comments in D129599). The fix was to check if
the whole function is cold, done in D133275.

This is overkill, and we can simply check if the header of any loop in
the current loop's loop nest is non-cold (looking at both outer and
inner loops). This patch drops the compile time for a large module by
40% with this approach.

I also updated PGO-nontrivial-unswitch2.ll, since it only had one cold
loop in a non-cold function; its IR is now based on the example given in
the comments on D129599 about the SPEC degradation. I confirmed that the
new version of the test fails with the original check from D129599,
which considered only the coldness of the current loop's header.

Similarly updated test PGO-nontrivial-unswitch.ll to contain a cold loop
in a cold loop nest, and created PGO-nontrivial-unswitch3.ll to contain
a non-cold loop in a non-cold loop nest.

Differential Revision: https://reviews.llvm.org/D146383
2023-03-20 10:14:50 -07:00

185 lines
9.6 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s
;; Check that non-trivial loop unswitching is applied to a non-cold loop in a
;; non-cold loop nest.
;; IR was generated from the following loop nest, profiled when called
;; with M=1000 and N=10.
;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) {
;; for (unsigned j = 0; j < M; j++)
;; for (unsigned i=0; i < N; i++) {
;; A[i] = B[i] + C[i];
;; if (cond) do_something();
;; }
;; }
;; Input IR for hotFunction: an outer loop over j that contains an inner loop
;; over i. Two loop-invariant conditions are branched on inside the nest:
;; %cmp217.not (N == 0) in the outer header and the argument %cond in the inner
;; body. The autogenerated assertions below expect both branches to be hoisted
;; in front of the loop nest, producing ".us" clones: one nest that skips the
;; inner loop, one whose inner body always calls do_something, and one whose
;; inner body never does.
define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_
; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF18:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0
; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF19:![0-9]+]]
; CHECK: for.cond1.preheader.lr.ph:
; CHECK-NEXT: [[CMP217_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP217_NOT]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT:%.*]], !prof [[PROF20:![0-9]+]]
; CHECK: for.cond1.preheader.lr.ph.split.us:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
; CHECK: for.cond1.preheader.us:
; CHECK-NEXT: [[J_020_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ], [ [[INC10_US:%.*]], [[FOR_COND_CLEANUP3_US:%.*]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US]]
; CHECK: for.cond.cleanup3.us:
; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[J_020_US]], 1
; CHECK-NEXT: [[EXITCOND22_NOT_US:%.*]] = icmp eq i32 [[INC10_US]], [[M]]
; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US]], !prof [[PROF19]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond1.preheader.lr.ph.split:
; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT:%.*]]
; CHECK: for.cond1.preheader.lr.ph.split.split.us:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US1:%.*]]
; CHECK: for.cond1.preheader.us1:
; CHECK-NEXT: [[J_020_US2:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT_US]] ], [ [[INC10_US4:%.*]], [[FOR_COND_CLEANUP3_US3:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_US:%.*]]
; CHECK: for.cond.cleanup3.us3:
; CHECK-NEXT: [[INC10_US4]] = add nuw i32 [[J_020_US2]], 1
; CHECK-NEXT: [[EXITCOND22_NOT_US5:%.*]] = icmp eq i32 [[INC10_US4]], [[M]]
; CHECK-NEXT: br i1 [[EXITCOND22_NOT_US5]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US1]], !prof [[PROF19]]
; CHECK: for.body4.preheader.us:
; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT_US_US:%.*]]
; CHECK: for.cond.cleanup3.loopexit.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_US3]]
; CHECK: for.body4.preheader.split.us.us:
; CHECK-NEXT: br label [[FOR_BODY4_US_US:%.*]]
; CHECK: for.body4.us.us:
; CHECK-NEXT: [[INDVARS_IV_US_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_US_US:%.*]], [[FOR_INC_US_US:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT_US_US]] ]
; CHECK-NEXT: [[ARRAYIDX_US_US:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV_US_US]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX_US_US]], align 4
; CHECK-NEXT: [[ARRAYIDX6_US_US:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV_US_US]]
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX6_US_US]], align 4
; CHECK-NEXT: [[ADD_US_US:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[ARRAYIDX8_US_US:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_US_US]]
; CHECK-NEXT: store i32 [[ADD_US_US]], ptr [[ARRAYIDX8_US_US]], align 4
; CHECK-NEXT: br label [[IF_THEN_US_US:%.*]]
; CHECK: if.then.us.us:
; CHECK-NEXT: tail call void @_Z12do_somethingv()
; CHECK-NEXT: br label [[FOR_INC_US_US]]
; CHECK: for.inc.us.us:
; CHECK-NEXT: [[WIDE_TRIP_COUNT_US_US:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[INDVARS_IV_NEXT_US_US]] = add nuw nsw i64 [[INDVARS_IV_US_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US_US:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_US_US]], [[WIDE_TRIP_COUNT_US_US]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US]], !prof [[PROF20]]
; CHECK: for.cond.cleanup3.loopexit.split.us.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_LOOPEXIT_US:%.*]]
; CHECK: for.cond.cleanup.loopexit.split.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT:%.*]]
; CHECK: for.cond1.preheader.lr.ph.split.split:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[J_020:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_SPLIT]] ], [ [[INC10:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER:%.*]]
; CHECK: for.body4.preheader:
; CHECK-NEXT: br label [[FOR_BODY4_PREHEADER_SPLIT:%.*]]
; CHECK: for.body4.preheader.split:
; CHECK-NEXT: br label [[FOR_BODY4:%.*]]
; CHECK: for.cond.cleanup.loopexit.split.split:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT]]
; CHECK: for.cond.cleanup.loopexit.split:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.cond.cleanup3.loopexit.split:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]]
; CHECK: for.cond.cleanup3.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]]
; CHECK: for.cond.cleanup3:
; CHECK-NEXT: [[INC10]] = add nuw i32 [[J_020]], 1
; CHECK-NEXT: [[EXITCOND22_NOT:%.*]] = icmp eq i32 [[INC10]], [[M]]
; CHECK-NEXT: br i1 [[EXITCOND22_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT:%.*]], label [[FOR_COND1_PREHEADER]], !prof [[PROF19]]
; CHECK: for.body4:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY4_PREHEADER_SPLIT]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT_SPLIT:%.*]], label [[FOR_BODY4]], !prof [[PROF20]]
;
;; Guard for the outer loop: return immediately when M == 0.
entry:
%cmp19.not = icmp eq i32 %M, 0
br i1 %cmp19.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph, !prof !37
;; Outer-loop preheader. The N == 0 test is computed once here, outside both
;; loops, but is only branched on inside the outer loop header below.
for.cond1.preheader.lr.ph:
%cmp217.not = icmp eq i32 %N, 0
br label %for.cond1.preheader
;; Outer-loop header: %j.020 is the outer counter (0 on entry, %inc10 from the
;; latch). Branches on the loop-invariant %cmp217.not to skip the inner loop.
for.cond1.preheader:
%j.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc10, %for.cond.cleanup3 ]
br i1 %cmp217.not, label %for.cond.cleanup3, label %for.body4, !prof !38
;; Single function exit, reached from the entry guard and the outer latch.
for.cond.cleanup:
ret void
;; Outer-loop latch: j++ and exit once j == M, otherwise start the next
;; outer iteration.
for.cond.cleanup3:
%inc10 = add nuw i32 %j.020, 1
%exitcond22.not = icmp eq i32 %inc10, %M
br i1 %exitcond22.not, label %for.cond.cleanup, label %for.cond1.preheader, !prof !37
;; Inner-loop header and body: A[i] = B[i] + C[i], indexed by %indvars.iv.
for.body4:
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.cond1.preheader ]
%arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx6 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
%1 = load i32, ptr %arrayidx6, align 4
%add = add nsw i32 %1, %0
%arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
store i32 %add, ptr %arrayidx8, align 4
;; Branch on the function argument %cond, which never changes inside the
;; loop nest; this is the non-trivial unswitch candidate under test.
br i1 %cond, label %if.then, label %for.inc
;; Conditional part of the inner body: the call guarded by %cond.
if.then:
tail call void @_Z12do_somethingv()
br label %for.inc
;; Inner-loop latch: i++ and leave the inner loop once i == zext(N).
for.inc:
%wide.trip.count = zext i32 %N to i64
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !prof !38
}
;; External callee invoked from the if.then block of the function above; only
;; a declaration is provided.
declare void @_Z12do_somethingv()
;; Module flag carrying the profile summary; the RUN line requests the
;; profile-summary analysis before running the loop pass.
!llvm.module.flags = !{!6}
!6 = !{i32 1, !"ProfileSummary", !7}
!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17}
!8 = !{!"ProfileFormat", !"InstrProf"}
!9 = !{!"TotalCount", i64 1002}
!10 = !{!"MaxCount", i64 1000}
!11 = !{!"MaxInternalCount", i64 1000}
!12 = !{!"MaxFunctionCount", i64 1}
!13 = !{!"NumCounts", i64 6}
!14 = !{!"NumFunctions", i64 3}
!15 = !{!"IsPartialProfile", i64 0}
!16 = !{!"PartialProfileRatio", double 0.000000e+00}
!17 = !{!"DetailedSummary", !18}
;; Detailed summary entries. The node numbering is intentionally sparse.
;; NOTE(review): each entry is presumably {cutoff, min count, number of
;; counts} per LLVM's ProfileSummary format - confirm against the LLVM docs.
!18 = !{!19, !29, !30, !32, !34}
!19 = !{i32 10000, i64 10000, i32 3}
!29 = !{i32 950000, i64 10000, i32 3}
!30 = !{i32 990000, i64 1000, i32 4}
!32 = !{i32 999900, i64 1000, i32 4}
!34 = !{i32 999999, i64 1, i32 6}
;; Entry count attached to @_Z11hotFunctionbiiPiS_S_ through its !prof.
!36 = !{!"function_entry_count", i64 1}
;; Weights for the M == 0 guard and the outer-loop exit branch: 1 for the
;; first successor (for.cond.cleanup), 1000 for the second.
!37 = !{!"branch_weights", i32 1, i32 1000}
;; Weights for the N == 0 branch in the outer header and the inner-loop exit
;; branch: 1000 for the first successor (for.cond.cleanup3), 10000 for the
;; second (for.body4).
!38 = !{!"branch_weights", i32 1000, i32 10000}