Summary: Previously, if the threshold was 2, we were willing to speculatively execute 2 cheap instructions in both basic blocks (thus we were willing to speculatively execute cost = 4), but weren't willing to speculate when one BB had 3 instructions and other one had no instructions, even thought that would have total cost of 3. This looks inconsistent to me. I don't think `cmov`-like instructions will start executing until both of it's inputs are available: https://godbolt.org/z/zgHePf So i don't see why the existing behavior is the correct one. Also, let's add it's own `cl::opt` for this threshold, with default=4, so it is not stricter than the previous threshold: will allow to fold when there are 2 BB's each with cost=2. And since the logic has changed, it will also allow to fold when one BB has cost=3 and other cost=1, or there is only one BB with cost=4. This is an alternative solution to D65148: This fix is mainly motivated by `signbit-like-value-extension.ll` test. That pattern comes up in JPEG decoding, see e.g. `Figure F.12 – Extending the sign bit of a decoded value in V` of `ITU T.81` (JPEG specification). That branch is not predictable, and it is within the innermost loop, so the fact that that pattern ends up being stuck with a branch instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial. This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240) | metric | old | new | delta | % | | x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% | | x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% | | x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% | | x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% | | x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% | | x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% | | x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% | We significantly decrease basic block count, notably decrease instruction count, significantly decrease branch count and very significantly increase `cmov` count. Performance-wise, unsurprisingly, this has great effect on target RawSpeed benchmark. I'm seeing 5 **major** improvements: ``` Benchmark Time CPU Time Old Time New CPU Old CPU New ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49 Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384 Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828 Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534 Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49 Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146 Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436 Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062 Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49 Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538 Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766 Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175 Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49 Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559 Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297 Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729 Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49 Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890 Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324 Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681 ``` That being said, it's always best-effort, so there will likely be cases where this worsens things. Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc Reviewed By: jmolloy Subscribers: xbolva00, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67318 llvm-svn: 372009
118 lines
5.0 KiB
LLVM
118 lines
5.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -simplifycfg -phi-node-folding-threshold=2 -S | FileCheck %s
|
|
|
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
|
target triple = "x86_64-unknown-linux-gnu"
|
|
|
|
define i32 @test1(i32 %a, i32 %b, i32 %c) nounwind {
|
|
; CHECK-LABEL: @test1(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[B:%.*]], 0
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[C:%.*]], 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[A:%.*]], 1
|
|
; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 [[A]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], i32 [[SPEC_SELECT]], i32 [[B]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1
|
|
; CHECK-NEXT: ret i32 [[TMP5]]
|
|
;
|
|
entry:
|
|
%tmp1 = icmp eq i32 %b, 0
|
|
br i1 %tmp1, label %bb1, label %bb3
|
|
|
|
bb1: ; preds = %entry
|
|
%tmp2 = icmp sgt i32 %c, 1
|
|
br i1 %tmp2, label %bb2, label %bb3
|
|
|
|
bb2: ; preds = bb1
|
|
%tmp3 = add i32 %a, 1
|
|
br label %bb3
|
|
|
|
bb3: ; preds = %bb2, %entry
|
|
%tmp4 = phi i32 [ %b, %entry ], [ %a, %bb1 ], [ %tmp3, %bb2 ]
|
|
%tmp5 = sub i32 %tmp4, 1
|
|
ret i32 %tmp5
|
|
}
|
|
|
|
define i8* @test4(i1* %dummy, i8* %a, i8* %b) {
|
|
; Test that we don't speculate an arbitrarily large number of unfolded constant
|
|
; expressions.
|
|
; CHECK-LABEL: @test4(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[COND1:%.*]] = load volatile i1, i1* [[DUMMY:%.*]]
|
|
; CHECK-NEXT: br i1 [[COND1]], label [[IF:%.*]], label [[END:%.*]]
|
|
; CHECK: if:
|
|
; CHECK-NEXT: [[COND2:%.*]] = load volatile i1, i1* [[DUMMY]]
|
|
; CHECK-NEXT: br i1 [[COND2]], label [[THEN:%.*]], label [[END]]
|
|
; CHECK: then:
|
|
; CHECK-NEXT: br label [[END]]
|
|
; CHECK: end:
|
|
; CHECK-NEXT: [[X1:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[B:%.*]], [[IF]] ], [ inttoptr (i64 1 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X2:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 2 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X3:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 3 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X4:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 4 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X5:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 5 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X6:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 6 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X7:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 7 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X8:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 8 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X9:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 9 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: [[X10:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[B]], [[IF]] ], [ inttoptr (i64 10 to i8*), [[THEN]] ]
|
|
; CHECK-NEXT: ret i8* [[X10]]
|
|
;
|
|
|
|
entry:
|
|
%cond1 = load volatile i1, i1* %dummy
|
|
br i1 %cond1, label %if, label %end
|
|
|
|
if:
|
|
%cond2 = load volatile i1, i1* %dummy
|
|
br i1 %cond2, label %then, label %end
|
|
|
|
then:
|
|
br label %end
|
|
|
|
end:
|
|
%x1 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 1 to i8*), %then ]
|
|
%x2 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 2 to i8*), %then ]
|
|
%x3 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 3 to i8*), %then ]
|
|
%x4 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 4 to i8*), %then ]
|
|
%x5 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 5 to i8*), %then ]
|
|
%x6 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 6 to i8*), %then ]
|
|
%x7 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 7 to i8*), %then ]
|
|
%x8 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 8 to i8*), %then ]
|
|
%x9 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 9 to i8*), %then ]
|
|
%x10 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 10 to i8*), %then ]
|
|
|
|
ret i8* %x10
|
|
}
|
|
|
|
define i32* @test5(i32 %a, i32 %b, i32 %c, i32* dereferenceable(10) %ptr1,
|
|
; CHECK-LABEL: @test5(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[B:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb1:
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[C:%.*]], 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[PTR3:%.*]]
|
|
; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP2]], i32* [[TMP3]], i32* [[PTR2:%.*]]
|
|
; CHECK-NEXT: ret i32* [[SPEC_SELECT]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: ret i32* [[PTR1:%.*]]
|
|
;
|
|
i32* dereferenceable(10) %ptr2, i32** dereferenceable(10) %ptr3) nounwind {
|
|
entry:
|
|
%tmp1 = icmp eq i32 %b, 0
|
|
br i1 %tmp1, label %bb1, label %bb3
|
|
|
|
bb1: ; preds = %entry
|
|
%tmp2 = icmp sgt i32 %c, 1
|
|
br i1 %tmp2, label %bb2, label %bb3
|
|
|
|
bb2: ; preds = bb1
|
|
%tmp3 = load i32*, i32** %ptr3, !dereferenceable !{i64 10}
|
|
br label %bb3
|
|
|
|
bb3: ; preds = %bb2, %entry
|
|
%tmp4 = phi i32* [ %ptr1, %entry ], [ %ptr2, %bb1 ], [ %tmp3, %bb2 ]
|
|
ret i32* %tmp4
|
|
}
|