Luke Lau e8219e5ce8
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor (#158690)
In 531.deepsjeng_r from SPEC CPU 2017 there's a loop that we
unprofitably loop vectorize on RISC-V.

The loop looks something like:

```c
  for (int i = 0; i < n; i++) {
    if (x0[i] == a)
      if (x1[i] == b)
        if (x2[i] == c)
          // do stuff...
  }
```

Because it's so deeply nested the actual inner level of the loop rarely
gets executed. However we still deem it profitable to vectorize, which
due to the if-conversion means we now always execute the body.

This stems from the fact that `getPredBlockCostDivisor` currently
assumes that blocks have 50% chance of being executed as a heuristic.

We can fix this by using BlockFrequencyInfo, which gives a more accurate
estimate of the innermost block being executed 12.5% of the time. We can
then calculate the cost divisor (the reciprocal of the block's execution
probability) as `HeaderFrequency / BlockFrequency`.

Fixing the cost here gives a 7% speedup for 531.deepsjeng_r on RISC-V.

Whilst there's a lot of changes in the in-tree tests, this doesn't
affect llvm-test-suite or SPEC CPU 2017 that much:

- On armv9-a -flto -O3 there's 0.0%/0.2% more geomean loops vectorized
on llvm-test-suite/SPEC CPU 2017.
- On x86-64 -flto -O3 **with PGO** there's 0.9%/0% less geomean loops
vectorized on llvm-test-suite/SPEC CPU 2017.

Overall geomean compile time impact is 0.03% on stage1-ReleaseLTO:
https://llvm-compile-time-tracker.com/compare.php?from=9eee396c58d2e24beb93c460141170def328776d&to=32fbff48f965d03b51549fdf9bbc4ca06473b623&stat=instructions%3Au
2025-12-08 14:28:26 +00:00

62 lines
2.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; REQUIRES: riscv-registered-target
; RUN: opt -p 'lto<O3>' -mtriple riscv64 -mattr=+v -S < %s | FileCheck %s
; Test that BlockFrequencyInfo is invalidated after loop passes, so it's
; accurate whenever LoopVectorize uses it. LoopVectorize requires that the
; frequency of an innermost loop header is greater than or equal to the
; frequency of any block it dominates.
; @f is a counted outer loop (%loop .. %latch) wrapping an inner cycle
; (%foo .. %baz) that conditionally stores through %gep. The function carries
; !prof !0, a function_entry_count of 1.
;
; The assertions below expect a single EVL-based vector body containing two
; identical vp.store calls, both masked by a splat of (xor %x, true).
define void @f(i1 %x) !prof !0 {
; CHECK-LABEL: define void @f(
; CHECK-SAME: i1 [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {{.*}}{
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[DOTSCALAR:%.*]] = xor i1 [[X]], true
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[DOTSCALAR]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 65, %[[ENTRY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr null, i64 [[EVL_BASED_IV]]
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 [[TMP4]], <vscale x 2 x i1> [[TMP2]], i32 [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg i32 [[TMP3]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[EVL_BASED_IV]], [[TMP5]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw nsw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop
; Outer loop header: %iv starts at 0 and is incremented by 1 in %latch.
loop:
%iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
; Store address for this outer iteration; the base pointer is null.
%gep = getelementptr i64, ptr null, i64 %iv
br label %foo
; Inner cycle header. %phi is false when entered from %loop and true when
; re-entered from %baz, so %foo runs exactly twice per outer iteration.
foo:
%phi = phi i1 [ false, %loop ], [ true, %baz ]
; %bar (the store) is only reached when %x is false.
br i1 %x, label %baz, label %bar
bar:
store i64 0, ptr %gep
br label %baz
; Leaves the inner cycle on the second pass (%phi == true); otherwise branches
; back to %foo.
baz:
br i1 %phi, label %latch, label %foo
; The exit test compares the pre-increment %iv against 64, so the outer loop
; body runs for %iv = 0..64 (65 iterations).
latch:
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, 64
br i1 %ec, label %exit, label %loop
exit:
ret void
}
!0 = !{!"function_entry_count", i64 1}