AMDGPU previously had no target-specific LSR cost model, so the generic heuristic would often introduce extra induction variables and base-add chains that hurt VALU throughput on GFX9+ (observed on gfx942). Implement a custom cost model: - isLSRCostLess(): prioritize per-iteration instruction count over setup costs, penalize IV multiplies, and demote register count. Pre-GFX9 falls back to the default comparator. - getScalingFactorCost(): report that base+scale*index addressing requires an extra ADD instruction. - isNumRegsMajorCostOfLSR(): return false. - shouldDropLSRSolutionIfLessProfitable(): return true. Assisted-by: Claude Opus
43 lines
1.5 KiB
LLVM
43 lines
1.5 KiB
LLVM
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
|
|
|
|
; Reduced from rocrand's threefry2x32_20 kernel.
|
|
; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
|
|
; variable when the loop already has a vector IV incremented by a uniform
|
|
; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
|
|
; in the loop body, wasting a VGPR and a VALU slot every iteration.
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
|
|
|
; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
|
|
; The loop must contain exactly one VALU add — the single vector IV update.
|
|
; A second v_add_u32 here would mean LSR created a redundant IV.
|
|
; CHECK: {{^}}.LBB0_1:
|
|
; CHECK: v_add_u32
|
|
; CHECK-NOT: v_add_u32
|
|
; CHECK: s_cbranch
|
|
define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
|
|
entry:
|
|
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
br label %loop
|
|
|
|
loop:
|
|
%iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
|
|
%iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
|
|
%sum1 = add i32 %iv.vec, %stride
|
|
%elt = extractelement <2 x i32> %arg0, i64 0
|
|
%sum2 = add i32 %sum1, %elt
|
|
%xor = xor i32 1, %sum2
|
|
%sum3 = add i32 %sum2, %xor
|
|
%sum4 = add i32 %sum3, %elt
|
|
%or = or i32 %sum4, %stride
|
|
%shr = lshr i32 %iv.pn, 1
|
|
%cmp = icmp ult i32 %sum1, 1024
|
|
br i1 %cmp, label %loop, label %exit
|
|
|
|
exit:
|
|
store i32 %or, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|