llvm-project/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
Iasonaskrpr 6c6fb00c94
[AMDGPU] Optimize S_OR_B32 to S_ADDK_I32 where possible (#177949)
This PR fixes #177753 by converting disjoint S_OR_B32 to S_ADDK_I32
whenever possible. It avoids this transformation when S_OR_B32 can be
converted to a bitset.

Note on Test Failures (Draft Status): This change causes significant
register reshuffling across the test suite, due to the new allocation
hints, the operand swaps performed when src0 is not a register and src1
is, and the change from or to addk. To avoid a massive, noisy diff
during the initial logic review:

This Draft PR only includes a representative sample of updated tests.
CodeGen/AMDGPU/combine-reg-or-const.ll -> Showcases change from S_OR to
S_ADDK
CodeGen/AMDGPU/s-barrier.ll -> Showcases swap between Src0 and Src1 if
src0 is not a register

The rest of the tests show the result of the register allocation hint we
give. I have checked every test I updated, and they seem OK to me.

Once the core logic is approved, I will run the update script across the
remaining ~70 failing tests and mark the PR as "Ready for Review."
2026-02-07 09:10:12 +00:00

56 lines
2.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s
; The OR instruction must not be eliminated by the "OR Combine" DAG optimization; because its operands share no set bits, it is expected to be emitted as s_addk_i32 with 0xc0.
define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-NEXT: s_add_i32 s12, s12, s17
; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 3
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %if.then
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CHECK-NEXT: s_and_b32 s4, s0, 0xffff
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab
; CHECK-NEXT: s_lshl_b64 s[4:5], s[0:1], 2
; CHECK-NEXT: s_lshr_b32 s1, s6, 19
; CHECK-NEXT: s_mul_i32 s1, s1, 12
; CHECK-NEXT: s_sub_i32 s6, s0, s1
; CHECK-NEXT: s_and_b32 s7, s6, 0xffff
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s2, s4
; CHECK-NEXT: s_addc_u32 s1, s3, s5
; CHECK-NEXT: s_bfe_u32 s2, s6, 0xd0003
; CHECK-NEXT: s_add_i32 s2, s2, s7
; CHECK-NEXT: s_addk_i32 s2, 0xc0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: .LBB0_2: ; %if.end
; CHECK-NEXT: s_endpgm
entry:
; Only the %s == 3 path does any work; every other value falls straight through to if.end.
%cmp = icmp eq i32 %s, 3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
; %rem.zext = (low 16 bits of %s) mod 12, zero-extended to i32, so it is always in [0, 11].
%rem.lhs.trunc = trunc i32 %s to i16
%rem4 = urem i16 %rem.lhs.trunc, 12
%rem.zext = zext i16 %rem4 to i32
; Address of the i32 element at index %s (zero-extended to i64) of the array at %Ad.coerce.
%idxprom = zext i32 %s to i64
%arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %Ad.coerce, i64 %idxprom
; %rem.zext is at most 11, so shifting right by 3 yields 0 or 1.
%div = lshr i32 %rem.zext, 3
; 192 (0xc0) sets only bits 6 and 7, while %rem.zext occupies at most bits 0-3, so the
; operands share no set bits and this or is equivalent to an add. The assertions above
; expect it as s_addk_i32 with 0xc0, applied after the s_add_i32 that sums the other two terms.
%or = or i32 %rem.zext, 192
%add = add nuw nsw i32 %or, %div
; Store the combined value to the selected array element.
store i32 %add, ptr addrspace(1) %arrayidx3, align 4
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}