In 531.deepsjeng_r from SPEC CPU 2017 there's a loop that we
unprofitably loop vectorize on RISC-V.
The loop looks something like:
```c
for (int i = 0; i < n; i++) {
if (x0[i] == a)
if (x1[i] == b)
if (x2[i] == c)
// do stuff...
}
```
Because it's so deeply nested the actual inner level of the loop rarely
gets executed. However we still deem it profitable to vectorize, which
due to the if-conversion means we now always execute the body.
This stems from the fact that `getPredBlockCostDivisor` currently
assumes that blocks have 50% chance of being executed as a heuristic.
We can fix this by using BlockFrequencyInfo, which gives a more accurate
estimate of the innermost block being executed 12.5% of the time. We can
then calculate the probability as `HeaderFrequency / BlockFrequency`.
Fixing the cost here gives a 7% speedup for 531.deepsjeng_r on RISC-V.
Whilst there's a lot of changes in the in-tree tests, this doesn't
affect llvm-test-suite or SPEC CPU 2017 that much:
- On armv9-a -flto -O3 there's 0.0%/0.2% more geomean loops vectorized
on llvm-test-suite/SPEC CPU 2017.
- On x86-64 -flto -O3 **with PGO** there's 0.9%/0% less geomean loops
vectorized on llvm-test-suite/SPEC CPU 2017.
Overall geomean compile time impact is 0.03% on stage1-ReleaseLTO:
https://llvm-compile-time-tracker.com/compare.php?from=9eee396c58d2e24beb93c460141170def328776d&to=32fbff48f965d03b51549fdf9bbc4ca06473b623&stat=instructions%3Au
62 lines
2.5 KiB
LLVM
62 lines
2.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
|
|
; REQUIRES: riscv-registered-target
|
|
; RUN: opt -p 'lto<O3>' -mtriple riscv64 -mattr=+v -S < %s | FileCheck %s
|
|
|
|
; Test that BlockFrequencyInfo is invalidated after loop passes, so it's
|
|
; accurate whenever LoopVectorize uses it. LoopVectorizer requires that
|
|
; innermost loop headers have a greater than or equal to frequency than any
|
|
; block it dominates.
|
|
|
|
define void @f(i1 %x) !prof !0 {
|
|
; CHECK-LABEL: define void @f(
|
|
; CHECK-SAME: i1 [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {{.*}}{
|
|
; CHECK-NEXT: [[ENTRY:.*]]:
|
|
; CHECK-NEXT: [[DOTSCALAR:%.*]] = xor i1 [[X]], true
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[DOTSCALAR]], i64 0
|
|
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
|
|
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; CHECK: [[VECTOR_BODY]]:
|
|
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 65, %[[ENTRY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr null, i64 [[EVL_BASED_IV]]
|
|
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
|
|
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg i32 [[TMP3]] to i64
|
|
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[EVL_BASED_IV]], [[TMP5]]
|
|
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw nsw i64 [[AVL]], [[TMP5]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
|
|
; CHECK-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
|
|
%gep = getelementptr i64, ptr null, i64 %iv
|
|
br label %foo
|
|
|
|
foo:
|
|
%phi = phi i1 [ false, %loop ], [ true, %baz ]
|
|
br i1 %x, label %baz, label %bar
|
|
|
|
bar:
|
|
store i64 0, ptr %gep
|
|
br label %baz
|
|
|
|
baz:
|
|
br i1 %phi, label %latch, label %foo
|
|
|
|
latch:
|
|
%iv.next = add i64 %iv, 1
|
|
%ec = icmp eq i64 %iv, 64
|
|
br i1 %ec, label %exit, label %loop
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
!0 = !{!"function_entry_count", i64 1}
|