Propagate demanded bits through the readfirstlane intrinsic by implementing SimplifyDemandedBitsForTargetNode in AMDGPUISelLowering. This allows upstream zero/sign extensions to be eliminated when only a subset of bits is used after the intrinsic. Partially addresses #128390.
31 lines
1.1 KiB
LLVM
31 lines
1.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s

; Compiles @main for amdgcn/gfx1200 and pins the exact instruction sequence
; emitted for an i1 readfirstlane result that is zero-extended to i32, passed
; through wwm, truncated back to i1, and then used to select the return value.
; The assertion lines inside the function are generated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; NOTE(review): presumably a regression test for demanded-bits handling around
; readfirstlane -- confirm against the originating change.
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_xor_b32 s0, s0, -1
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_wait_alu depctr_va_sdst(0)
; CHECK-NEXT: ; return to shader part epilog
bb:
  ; i1 readfirstlane applied to the constant false.
  %i = call i1 @llvm.amdgcn.readfirstlane.i1(i1 false)
  br label %bb1

bb1:
  ; Widen the i1 to i32 so it can be fed to the i32 wwm intrinsic.
  %i2 = zext i1 %i to i32
  %i3 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
  %i4 = call i32 @llvm.amdgcn.wwm.i32(i32 %i2)
  ; Only bit 0 of each wwm result is consumed from here on.
  %i5 = trunc i32 %i4 to i1
  %i6 = trunc i32 %i3 to i1
  %i7 = or i1 %i6, %i5
  ; Yields 0 when either truncated bit is set, 1 otherwise.
  %i8 = select i1 %i7, i32 0, i32 1
  ret i32 %i8
}