llvm-project/llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
michaelselehov 621fc8774e
[AMDGPU] Implement LSR cost model for GFX9+ (#184138)
AMDGPU previously had no target-specific LSR cost model, so the generic
heuristic would often introduce extra induction variables and base-add
chains that hurt VALU throughput on GFX9+ (observed on gfx942).

Implement a custom cost model:

- isLSRCostLess(): prioritize per-iteration instruction count over setup
costs, penalize IV multiplies, and demote register count. Pre-GFX9 falls
back to the default comparator.
- getScalingFactorCost(): report that base+scale*index addressing
requires an extra ADD instruction.
- isNumRegsMajorCostOfLSR(): return false.
- shouldDropLSRSolutionIfLessProfitable(): return true.

Assisted-by: Claude Opus
2026-03-23 12:18:11 +01:00

43 lines
1.5 KiB
LLVM

; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
; Reduced from rocrand's threefry2x32_20 kernel.
; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
; variable when the loop already has a vector IV incremented by a uniform
; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
; in the loop body, wasting a VGPR and a VALU slot every iteration.
; AMDGPU intrinsic called once in the kernel's entry block; its i32 result is
; the starting value of the %iv.vec induction variable.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
; The loop body must contain exactly one v_add_u32 before its s_cbranch — the
; single vector IV update.
; A second v_add_u32 here would mean LSR created a redundant IV.
; CHECK: {{^}}.LBB0_1:
; CHECK: v_add_u32
; CHECK-NOT: v_add_u32
; CHECK: s_cbranch
; Kernel under test: a single-block loop whose induction variable starts at
; the workitem id and advances by the %stride kernel argument each iteration.
;   %arg0   - <2 x i32>; only element 0 is read (as %elt).
;   %stride - i32 added to the induction variable every iteration and OR-ed
;             into the final result.
;   %out    - addrspace(1) pointer that receives the last %or value on exit.
; NOTE(review): the instruction sequence is a reduced reproducer and is the
; input the FileCheck directives depend on; keep it byte-for-byte unless the
; expected output is re-validated.
define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
entry:
; Seed for the vector induction variable.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %loop
loop:
; %iv.pn carries the previous iteration's %or (0 on entry). Its only user is
; %shr below, which itself has no users in this function.
; NOTE(review): presumably left over from test reduction - confirm before
; removing, since it may influence what LSR sees.
%iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
; %iv.vec is the induction variable: %tid on entry, then %sum1 on the back edge.
%iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
; IV update: advance by %stride.
%sum1 = add i32 %iv.vec, %stride
; Loop-invariant: element 0 of the %arg0 argument.
%elt = extractelement <2 x i32> %arg0, i64 0
; Arithmetic chain derived from the updated IV; its end value %or is what gets
; stored after the loop.
%sum2 = add i32 %sum1, %elt
%xor = xor i32 1, %sum2
%sum3 = add i32 %sum2, %xor
%sum4 = add i32 %sum3, %elt
%or = or i32 %sum4, %stride
%shr = lshr i32 %iv.pn, 1
; Keep looping while the updated IV is (unsigned) below 1024.
%cmp = icmp ult i32 %sum1, 1024
br i1 %cmp, label %loop, label %exit
exit:
; Store the %or value computed in the final iteration.
store i32 %or, ptr addrspace(1) %out
ret void
}
; Attribute group #0, referenced by the llvm.amdgcn.workitem.id.x declaration.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }