llvm-project/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
Philip Reames 2175c6cb38
[RISCV] Set AllocationPriority in line with LMUL (#131176)
This mechanism causes the greedy register allocator to prefer allocating
register classes with higher priority first. This helps to ensure that
high LMUL registers obtain a register without having to go through the
eviction mechanism. In practice, it seems to cause a fair amount of
code churn and some minor improvements around widening and narrowing
operations.
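
For context, a hedged illustration (not part of this patch or its
tests): an LMUL=8 value such as <vscale x 16 x i32> must live in one of
only four aligned 8-register groups (v0, v8, v16, v24), which is why
allocating the high-LMUL classes first avoids going through eviction.

; Hedged sketch, assuming -mtriple=riscv64 -mattr=+v as in the test
; below: the nxv16i32 operands are LMUL=8, so each needs an aligned
; group of 8 vector registers, leaving only four candidates per value.
define <vscale x 16 x i32> @m8_add(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) {
  %sum = add <vscale x 16 x i32> %a, %b
  ret <vscale x 16 x i32> %sum
}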

In a few of the widening tests, we have what look like code size
regressions because we end up with two smaller register class copies
instead of one larger one after the instruction. However, in any larger
code sequence, these are likely to be folded into the producing
instructions. (But so were the wider copies after the operation.)
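
As a hedged sketch of the widening case (the function and vector shapes
are illustrative, not taken from the affected tests): a widening add
reads LMUL=1 sources and writes an LMUL=2 result, so the values cross
register classes at the instruction boundary and copies can land on
either side.

; Hedged sketch: nxv2i32 sources are LMUL=1 while the nxv2i64 result is
; LMUL=2, so sources and destination use different register classes;
; this typically selects to a vwadd.vv under -mattr=+v.
define <vscale x 2 x i64> @widening_add(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) {
  %ae = sext <vscale x 2 x i32> %a to <vscale x 2 x i64>
  %be = sext <vscale x 2 x i32> %b to <vscale x 2 x i64>
  %sum = add <vscale x 2 x i64> %ae, %be
  ret <vscale x 2 x i64> %sum
}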

Two observations:
1) We're not setting the greedy-regclass-priority-trumps-globalness flag
   on the register class, so this doesn't help long mask ranges.  I
   thought about doing that, but the benefit is non-obvious, so I
   decided it was, at a minimum, worth a separate change.
2) We could arguably set the priority higher for the register classes
   that exclude v0.  I tried that, and it caused a whole bunch of
   further churn.  I may return to it in a separate patch (the v0
   constraint is sketched below).
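
To illustrate observation 2 (a hedged sketch, not from the patch):
masked operations pin the mask to v0, so their data operands must come
from the register classes that exclude v0, e.g. VRNoV0.

; Hedged sketch: the mask of a VP intrinsic lowers to v0 on RISC-V, so
; the data operands are allocated from the classes that exclude v0.
define <vscale x 2 x i32> @masked_add(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
  %r = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
  ret <vscale x 2 x i32> %r
}
declare <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)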
2025-03-18 08:25:49 -07:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, <vscale x 8 x i1> %0, <vscale x 8 x i1> %1) {
; CHECK-LABEL: sum:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a2, a2, 1
; CHECK-NEXT:    beqz a2, .LBB0_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    li a3, 0
; CHECK-NEXT:  .LBB0_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mv a2, a3
; CHECK-NEXT:    lw a3, 0(a0)
; CHECK-NEXT:    addi a0, a0, 4
; CHECK-NEXT:    bnez a1, .LBB0_2
; CHECK-NEXT:  # %bb.3: # %for.end
; CHECK-NEXT:    mv a0, a2
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB0_4: # %vector.ph
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
; CHECK-NEXT:    vredsum.vs v8, v8, v12, v0.t
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  br i1 %prof.min.iters.check, label %for.body, label %vector.ph

vector.ph:                                        ; preds = %entry
  %2 = tail call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i1> %0, i32 1)
  br label %for.end

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %red.05 = phi i32 [ %3, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr i32, ptr %a, i64 %indvars.iv
  %3 = load i32, ptr %arrayidx, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %exitcond.not = icmp eq i32 %n, 0
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %vector.ph
  %red.0.lcssa = phi i32 [ %2, %vector.ph ], [ %red.05, %for.body ]
  ret i32 %red.0.lcssa
}

declare i32 @llvm.vp.reduce.add.nxv8i32(i32, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)