Luke Lau 4bb250d6a3
[VPlan] Always consider register pressure on RISC-V (#156951)
Stacked on #156923 

In https://godbolt.org/z/8svWaredK, we spill a lot on RISC-V because
whilst the largest element type is i8, we generate a bunch of pointer
vectors for gathers and scatters. This means the VF chosen is quite high
e.g. <vscale x 16 x i8>, but we end up using a bunch of <vscale x 16 x
i64> m8 registers for the pointers.

This was briefly fixed by #132190 where we computed register pressure in
VPlan and used it to prune VFs that were likely to spill. The legacy
cost model wasn't able to do this pruning because it didn't have
visibility into the pointer vectors that were needed for the
gathers/scatters.

However VF pruning was restricted again to just the case when max
bandwidth was enabled in #141736 to avoid an AArch64 regression, and
restricted again in #149056 to only prune VFs that had max bandwidth
enabled.

On RISC-V we take advantage of register grouping for performance and
choose a default of LMUL 2, which means there are 16 registers to work
with – half the number as SVE, so we encounter higher register pressure
more frequently.

As such, we likely want to always consider pruning VFs with high
register pressure and not just the VFs from max bandwidth.

This adds a TTI hook to opt into this behaviour for RISC-V which fixes
the motivating godbolt example above. When last checked this
significantly reduces the number of spills on SPEC CPU 2017, up to
80% on 538.imagick_r.
2025-09-12 06:21:54 +00:00

127 lines
6.2 KiB
LLVM

; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
; RUN: -force-vector-width=1 -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALAR
; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
; RUN: -riscv-v-register-bit-width-lmul=1 \
; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL1
; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
; RUN: -riscv-v-register-bit-width-lmul=2 \
; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL2
; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
; RUN: -riscv-v-register-bit-width-lmul=4 \
; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL4
; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
; RUN: -riscv-v-register-bit-width-lmul=8 \
; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL8
define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
; CHECK-SCALAR-LABEL: add
; CHECK-SCALAR: LV(REG): VF = 1
; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 2 item
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers
; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-LMUL1-LABEL: add
; CHECK-LMUL1: LV(REG): VF = vscale x 2
; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-LMUL2-LABEL: add
; CHECK-LMUL2: LV(REG): VF = vscale x 4
; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-LMUL4-LABEL: add
; CHECK-LMUL4: LV(REG): VF = vscale x 8
; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-LMUL8-LABEL: add
; CHECK-LMUL8: LV(REG): VF = vscale x 16
; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers
; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
entry:
%conv = zext i32 %size to i64
%cmp10.not = icmp eq i32 %size, 0
br i1 %cmp10.not, label %for.cond.cleanup, label %for.body
for.cond.cleanup:
ret void
for.body:
%i.011 = phi i64 [ %add4, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds float, ptr %src1, i64 %i.011
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %src2, i64 %i.011
%1 = load float, ptr %arrayidx2, align 4
%add = fadd float %0, %1
%arrayidx3 = getelementptr inbounds float, ptr %result, i64 %i.011
store float %add, ptr %arrayidx3, align 4
%add4 = add nuw nsw i64 %i.011, 1
%exitcond.not = icmp eq i64 %add4, %conv
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
define void @goo(ptr nocapture noundef %a, i32 noundef signext %n) {
; CHECK-SCALAR-LABEL: goo
; CHECK-SCALAR: LV(REG): VF = 1
; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 1 item
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
; CHECK-LMUL1: LV(REG): VF = vscale x 2
; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-LMUL2: LV(REG): VF = vscale x 4
; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
; CHECK-LMUL4: LV(REG): VF = vscale x 8
; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
; CHECK-LMUL8: LV(REG): VF = vscale x 16
; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item
; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers
entry:
%cmp3 = icmp sgt i32 %n, 0
br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
%0 = load ptr, ptr %arrayidx, align 8
%add.ptr = getelementptr inbounds i32, ptr %0, i64 1
store ptr %add.ptr, ptr %arrayidx, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}