; llvm-project/llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck --check-prefix=GCN %s

; Regression test for building a buffer store's <4 x i32> operand from a value
; that originates in a float-to-unsigned conversion.
;
; The i32 produced by the fptoui below feeds, through shl/or/and/lshr, every
; element of the <4 x i32> passed as the second operand of the raw buffer
; store. The expected output is straight-line code in a single block: the
; converted value is copied into a scalar register with one
; v_readfirstlane_b32, the remaining integer operations are emitted as scalar
; instructions, and there is no loop around the store (cf. "no_waterfall" in
; the test's file name).
;
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @test_should_convert_to_v_readfirstlane_b32(float %fval, i32 %arg1, i32 %arg2) {
; GCN-LABEL: test_should_convert_to_v_readfirstlane_b32:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, s0
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_readfirstlane_b32 s0, v0
; GCN-NEXT:    s_lshl_b32 s0, s0, 16
; GCN-NEXT:    s_or_b32 s5, s0, s1
; GCN-NEXT:    s_and_b32 s6, s5, s2
; GCN-NEXT:    s_lshr_b32 s4, s6, 2
; GCN-NEXT:    s_mov_b32 s7, s4
; GCN-NEXT:    v_mov_b32_e32 v0, s1
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0 nt
; GCN-NEXT:    s_endpgm
entry:
  ; Float-to-unsigned conversion. The expected output emits it as
  ; v_cvt_u32_f32 writing a vector register (v0), followed by the
  ; v_readfirstlane_b32 that copies the result into s0.
  %conv = fptoui float %fval to i32

  ; Integer chain derived from %conv and the two i32 kernel arguments. The
  ; expected output uses s_lshl_b32 / s_or_b32 / s_and_b32 / s_lshr_b32 here.
  %shl = shl i32 %conv, 16
  %or = or i32 %shl, %arg1
  %and = and i32 %or, %arg2
  %shr = lshr i32 %and, 2

  ; Assemble the four dwords of the vector operand: [%shr, %or, %and, %shr].
  ; %shr is used for both element 0 and element 3, which matches the
  ; s_mov_b32 s7, s4 copy in the expected output.
  %sgpr128_0 = insertelement <4 x i32> poison, i32 %shr, i32 0
  %sgpr128_1 = insertelement <4 x i32> %sgpr128_0, i32 %or, i32 1
  %sgpr128_2 = insertelement <4 x i32> %sgpr128_1, i32 %and, i32 2
  %sgpr128_3 = insertelement <4 x i32> %sgpr128_2, i32 %shr, i32 3

  ; Store %arg1 using the vector built above as the second operand; the
  ; expected output passes it as s[4:7] to buffer_store_dword.
  ; NOTE(review): the final operand (i32 2) is presumably what yields the
  ; `nt` modifier on the expected store - confirm against the intrinsic's
  ; documented auxiliary/cache-policy bits for gfx942.
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %arg1, <4 x i32> %sgpr128_3, i32 0, i32 0, i32 2)

  ret void
}

; Declaration of the raw buffer store intrinsic called by the test above.
; It takes an i32, a <4 x i32>, and three further i32 operands, and returns
; nothing. Attribute group #0 is attached to it.
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0

; Attribute group #0: marks the declaration above as nounwind.
attributes #0 = { nounwind }