Stacked on #156923. In https://godbolt.org/z/8svWaredK, we spill a lot on RISC-V because, whilst the largest element type is i8, we generate a bunch of pointer vectors for gathers and scatters. This means the VF chosen is quite high, e.g. <vscale x 16 x i8>, but we end up using a bunch of <vscale x 16 x i64> m8 registers for the pointers. This was briefly fixed by #132190, where we computed register pressure in VPlan and used it to prune VFs that were likely to spill. The legacy cost model wasn't able to do this pruning because it didn't have visibility into the pointer vectors that were needed for the gathers/scatters. However, VF pruning was later restricted to just the case when max bandwidth was enabled in #141736 to avoid an AArch64 regression, and restricted further in #149056 to only prune VFs that had max bandwidth enabled. On RISC-V we take advantage of register grouping for performance and choose a default of LMUL 2, which means there are 16 registers to work with – half as many as SVE – so we encounter higher register pressure more frequently. As such, we likely want to always consider pruning VFs with high register pressure, and not just the VFs from max bandwidth. This adds a TTI hook to opt into this behaviour for RISC-V, which fixes the motivating godbolt example above. When last checked, this significantly reduces the number of spills on SPEC CPU 2017, by up to 80% on 538.imagick_r.
34 lines
1.4 KiB
LLVM
34 lines
1.4 KiB
LLVM
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s

; Element-wise bfloat addition: result[i] = src1[i] + src2[i] for every
; i in [0, %size). The FileCheck directives inside the function pin the
; LV(REG) register-usage lines reported for this loop at VF = vscale x 4.
define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
; CHECK-LABEL: add
; CHECK: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 6 registers
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers

entry:
  ; Widen the 32-bit trip count once so the loop can compare 64-bit indices.
  %conv = zext i32 %size to i64
  ; Skip the loop entirely when there are no elements to process.
  %cmp10.not = icmp eq i32 %size, 0
  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
  ret void

for.body:
  ; %i.011 is the element index: 0 on entry, incremented by one per iteration.
  %i.011 = phi i64 [ %add4, %for.body ], [ 0, %entry ]
  ; Elements are addressed with a bfloat (2-byte) stride, so consecutive
  ; elements are only 2-byte aligned. Claiming a larger alignment on these
  ; accesses would be undefined behaviour for odd indices, hence align 2.
  %arrayidx = getelementptr inbounds bfloat, ptr %src1, i64 %i.011
  %0 = load bfloat, ptr %arrayidx, align 2
  %arrayidx2 = getelementptr inbounds bfloat, ptr %src2, i64 %i.011
  %1 = load bfloat, ptr %arrayidx2, align 2
  %add = fadd bfloat %0, %1
  %arrayidx3 = getelementptr inbounds bfloat, ptr %result, i64 %i.011
  store bfloat %add, ptr %arrayidx3, align 2
  %add4 = add nuw nsw i64 %i.011, 1
  ; Leave the loop once the next index reaches the zero-extended trip count.
  %exitcond.not = icmp eq i64 %add4, %conv
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
|