
This mechanism causes the greedy register allocator to allocate register classes with higher priority first. This helps ensure that virtual registers in high-LMUL register classes obtain a physical register without having to go through the eviction mechanism. In practice, it causes a fair amount of test churn and some minor improvement around widening and narrowing operations. In a few of the widening tests, we see what look like code size regressions, because we end up with two copies of a smaller register class instead of one copy of a larger one after the instruction. However, in any larger code sequence, these copies are likely to be folded into the producing instructions (though the same was true of the single wider copy).

Two observations:
1) We're not setting the greedy-regclass-priority-trumps-globalness flag on the register classes, so this doesn't help long mask ranges. I considered doing so, but the benefit is non-obvious, so it seemed worth at least a separate change.
2) We could arguably set the priority higher still for the register classes that exclude v0. I tried that, and it caused a good deal of further churn; I may return to it in a separate patch.
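For reference, the mechanism boils down to the AllocationPriority field on the TableGen register class definitions. What follows is a minimal sketch of the shape of such a change, assuming definitions along the lines of RISCVRegisterInfo.td; the priority values are illustrative and the class bodies are elided, so this is not the actual diff:

// Sketch only: AllocationPriority and GreedyRegClassPriorityTrumpsGlobalness
// are real RegisterClass fields (llvm/include/llvm/Target/Target.td), but the
// definitions below are simplified stand-ins for the RISCVRegisterInfo.td ones.

// LMUL=8 groups have only four candidates (v0, v8, v16, v24), so they are
// the hardest to place and get colored first.
let AllocationPriority = 3 in
def VRM8 : RegisterClass<"RISCV", /* vector types */, /* alignment */, /* LMUL=8 groups */>;

let AllocationPriority = 2 in
def VRM4 : RegisterClass<"RISCV", /* ... */>;

let AllocationPriority = 1 in
def VRM2 : RegisterClass<"RISCV", /* ... */>;

// VR (LMUL <= 1) keeps the default priority of 0. Per observation 1, this
// sketch deliberately does not set
//   let GreedyRegClassPriorityTrumpsGlobalness = true;
// and per observation 2, the *NoV0 variants are left at the same priority
// as their parent classes.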
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, <vscale x 8 x i1> %0, <vscale x 8 x i1> %1) {
; CHECK-LABEL: sum:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a2, a2, 1
; CHECK-NEXT:    beqz a2, .LBB0_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    li a3, 0
; CHECK-NEXT:  .LBB0_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mv a2, a3
; CHECK-NEXT:    lw a3, 0(a0)
; CHECK-NEXT:    addi a0, a0, 4
; CHECK-NEXT:    bnez a1, .LBB0_2
; CHECK-NEXT:  # %bb.3: # %for.end
; CHECK-NEXT:    mv a0, a2
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB0_4: # %vector.ph
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
; CHECK-NEXT:    vredsum.vs v8, v8, v12, v0.t
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  br i1 %prof.min.iters.check, label %for.body, label %vector.ph

vector.ph:                                        ; preds = %entry
  %2 = tail call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i1> %0, i32 1)
  br label %for.end

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %red.05 = phi i32 [ %3, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr i32, ptr %a, i64 %indvars.iv
  %3 = load i32, ptr %arrayidx, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %exitcond.not = icmp eq i32 %n, 0
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %vector.ph
  %red.0.lcssa = phi i32 [ %2, %vector.ph ], [ %red.05, %for.body ]
  ret i32 %red.0.lcssa
}

declare i32 @llvm.vp.reduce.add.nxv8i32(i32, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)