Min-Yih Hsu 7ebbbd885f
[DAG] Always use stack to promote bitcast when the source is vector (#151065)
The optimization introduced by #125637 tried to avoid using stacks to
promote bitcast with vector result type. However, it wouldn't be correct
if the input type is vector. This patch limits that optimization to
only scalar-to-vector bitcasts.
2025-08-02 15:32:10 -07:00

5075 lines
192 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck --check-prefix=EG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; TODO: FIXME-TRUE16 - Enable this llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-TRUE16 %s
; Crashing on v_test_imin_slt_i16
; LLVM ERROR: Cannot select: 0x5f895f65b050: i16,ch = load<(load (s16) from %ir.b.gep, addrspace 1)>
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s
; Signed-min test on i32 with per-workitem operands: every work item loads %a
; and %b from global memory (addrspace 1) at index workitem.id.x, picks the
; smaller one with icmp sle + select, and stores it to %out at the same index.
; The assertions below expect the compare/select pair to be emitted as a single
; signed-min instruction (MIN_INT on r600, v_min_i32 on amdgcn).
define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_sle_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v5, v[0:1]
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_i32_e32 v2, v5, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_i32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_sle_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_sle_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_imin_sle_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_imin_sle_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; Address all three buffers with the same per-workitem element index.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
; (%a <= %b) ? %a : %b, i.e. the signed minimum of %a and %b.
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out.gep, align 4
ret void
}
; Signed-min test on i32 where both operands are kernel arguments rather than
; per-workitem loads. The assertions below expect one s_min_i32 on amdgcn (and
; one MIN_INT on r600) followed by a single dword store to %out.
define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; EG-LABEL: s_test_imin_sle_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; (%a <= %b) ? %a : %b, i.e. the signed minimum of the two arguments.
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Same signed-min pattern as the scalar i32 kernel-argument test, but written
; on single-element vectors (<1 x i32>). The expected instructions below are
; the same as for plain i32: one s_min_i32 on amdgcn / one MIN_INT on r600 and
; one dword store.
define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
; EG-LABEL: s_test_imin_sle_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v1i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v1i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v1i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Vector-typed compare and select over the single lane; the store carries no
; explicit alignment.
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, ptr addrspace(1) %out
ret void
}
; Signed-min test on <4 x i32> kernel arguments. The assertions below expect
; the vector compare/select to be emitted as four independent 32-bit signed
; mins (s_min_i32 on amdgcn, MIN_INT on r600), with the four results written
; back by one 128-bit store.
define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MIN_INT * T0.W, KC0[4].X, KC0[5].X,
; EG-NEXT: MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
; EG-NEXT: MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
; EG-NEXT: MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
; CI-NEXT: s_min_i32 s1, s1, s5
; CI-NEXT: s_min_i32 s0, s0, s4
; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
; VI-NEXT: s_min_i32 s1, s1, s5
; VI-NEXT: s_min_i32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s3, s3, s7
; GFX9-NEXT: s_min_i32 s2, s2, s6
; GFX9-NEXT: s_min_i32 s1, s1, s5
; GFX9-NEXT: s_min_i32 s0, s0, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s3, s3, s7
; GFX10-NEXT: s_min_i32 s2, s2, s6
; GFX10-NEXT: s_min_i32 s0, s0, s4
; GFX10-NEXT: s_min_i32 s1, s1, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s11, s15
; GFX11-NEXT: s_min_i32 s3, s10, s14
; GFX11-NEXT: s_min_i32 s4, s8, s12
; GFX11-NEXT: s_min_i32 s5, s9, s13
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: v_mov_b32_e32 v3, s2
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v4i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s11, s15
; GFX1250-NEXT: s_min_i32 s3, s10, s14
; GFX1250-NEXT: s_min_i32 s4, s8, s12
; GFX1250-NEXT: s_min_i32 s5, s9, s13
; GFX1250-NEXT: v_mov_b32_e32 v0, s4
; GFX1250-NEXT: v_mov_b32_e32 v1, s5
; GFX1250-NEXT: v_mov_b32_e32 v2, s3
; GFX1250-NEXT: v_mov_b32_e32 v3, s2
; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX1250-NEXT: s_endpgm
; Lane-wise (%a[i] <= %b[i]) ? %a[i] : %b[i] over all four lanes.
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %val, ptr addrspace(1) %out
ret void
}
; Signed-min test on i8 kernel arguments. An unnamed [8 x i32] argument sits
; before each i8 operand; in the assertions below %a and %b are fetched by
; separate loads at distinct kernarg offsets (0x28 and 0x4c in bytes on VI and
; newer).
; NOTE(review): the padding is presumably there to keep the two i8 arguments
; from sharing a load -- confirm against the test's history.
; Expected lowering on amdgcn: sign-extend both operands from 8 bits
; (s_sext_i32_i8), take a 32-bit signed min (s_min_i32), then store one byte.
; On r600 the operands are sign-extended with BFE_INT before MIN_INT.
define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
; EG-LABEL: s_test_imin_sle_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x4c
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x28
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: s_sext_i32_i8 s3, s3
; GFX9-NEXT: s_min_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x4c
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x28
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i8 s2, s2
; GFX10-NEXT: s_sext_i32_i8 s3, s3
; GFX10-NEXT: s_min_i32 s2, s3, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x28
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i8 s2, s2
; GFX11-NEXT: s_sext_i32_i8 s3, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_i32 s2, s3, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_i8:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x2
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x28
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sext_i32_i8 s2, s2
; GFX1250-NEXT: s_sext_i32_i8 s3, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_min_i32 s2, s3, s2
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; (%a <= %b) ? %a : %b on 8-bit signed values.
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, ptr addrspace(1) %out
ret void
}
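; FIXME: Why vector and sdwa for last element?
; NOTE(review): the autogenerated assertions for the function below currently
; show only scalar instructions and no sdwa forms, so this FIXME may be stale;
; confirm before removing it.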
; Signed-min test on <4 x i8> kernel arguments, each preceded by an unnamed
; [8 x i32] argument. In the amdgcn assertions below every byte lane is
; extracted with sign extension (s_sext_i32_i8, s_bfe_i32, s_ashr_i32), the
; four lanes are reduced with four s_min_i32 instructions, and the results are
; repacked with shift/and/or into one dword that is written by a single 32-bit
; store. On r600 the lanes are read with eight VTX_READ_8 fetches, sign-extended
; with BFE_INT, reduced with MIN_INT and packed the same way.
define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @22, KC0[], KC1[]
; EG-NEXT: TEX 7 @6
; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
; EG-NEXT: ALU clause starting at 22:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 23:
; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PS, literal.x,
; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, PS, literal.x,
; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
; EG-NEXT: OR_INT T0.W, PS, PV.Y,
; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: OR_INT T4.X, PV.W, PS,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
; CI-NEXT: s_bfe_i32 s6, s2, 0x80008
; CI-NEXT: s_bfe_i32 s2, s2, 0x80010
; CI-NEXT: s_ashr_i32 s7, s3, 24
; CI-NEXT: s_sext_i32_i8 s8, s3
; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: s_min_i32 s4, s4, s7
; CI-NEXT: s_and_b32 s2, s2, 0xff
; CI-NEXT: s_lshl_b32 s4, s4, 24
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: s_or_b32 s2, s4, s2
; CI-NEXT: s_min_i32 s3, s6, s9
; CI-NEXT: s_min_i32 s4, s5, s8
; CI-NEXT: s_lshl_b32 s3, s3, 8
; CI-NEXT: s_and_b32 s4, s4, 0xff
; CI-NEXT: s_or_b32 s3, s4, s3
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_ashr_i32 s6, s3, 24
; VI-NEXT: s_min_i32 s4, s4, s6
; VI-NEXT: s_bfe_i32 s6, s3, 0x80010
; VI-NEXT: s_bfe_i32 s8, s2, 0x80010
; VI-NEXT: s_min_i32 s6, s8, s6
; VI-NEXT: s_sext_i32_i16 s5, s2
; VI-NEXT: s_sext_i32_i16 s7, s3
; VI-NEXT: s_lshl_b32 s4, s4, 8
; VI-NEXT: s_and_b32 s6, s6, 0xff
; VI-NEXT: s_or_b32 s4, s6, s4
; VI-NEXT: s_ashr_i32 s6, s7, 8
; VI-NEXT: s_ashr_i32 s5, s5, 8
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_min_i32 s5, s5, s6
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s5, s5, 8
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_or_b32 s2, s2, s5
; VI-NEXT: s_lshl_b32 s4, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NEXT: s_sext_i32_i16 s7, s3
; GFX9-NEXT: s_ashr_i32 s7, s7, 8
; GFX9-NEXT: s_ashr_i32 s5, s5, 8
; GFX9-NEXT: s_ashr_i32 s4, s2, 24
; GFX9-NEXT: s_ashr_i32 s6, s3, 24
; GFX9-NEXT: s_min_i32 s5, s5, s7
; GFX9-NEXT: s_sext_i32_i8 s7, s3
; GFX9-NEXT: s_sext_i32_i8 s8, s2
; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010
; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010
; GFX9-NEXT: s_min_i32 s7, s8, s7
; GFX9-NEXT: s_min_i32 s4, s4, s6
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_and_b32 s7, s7, 0xff
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: s_or_b32 s5, s7, s5
; GFX9-NEXT: s_or_b32 s2, s2, s4
; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-NEXT: s_lshl_b32 s2, s2, 16
; GFX9-NEXT: s_or_b32 s2, s5, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s5, s2
; GFX10-NEXT: s_sext_i32_i16 s7, s3
; GFX10-NEXT: s_ashr_i32 s4, s2, 24
; GFX10-NEXT: s_ashr_i32 s6, s3, 24
; GFX10-NEXT: s_sext_i32_i8 s8, s3
; GFX10-NEXT: s_sext_i32_i8 s9, s2
; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010
; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010
; GFX10-NEXT: s_ashr_i32 s7, s7, 8
; GFX10-NEXT: s_ashr_i32 s5, s5, 8
; GFX10-NEXT: s_min_i32 s8, s9, s8
; GFX10-NEXT: s_min_i32 s4, s4, s6
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: s_min_i32 s3, s5, s7
; GFX10-NEXT: s_and_b32 s5, s8, 0xff
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: s_or_b32 s3, s5, s3
; GFX10-NEXT: s_or_b32 s2, s2, s4
; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-NEXT: s_lshl_b32 s2, s2, 16
; GFX10-NEXT: s_or_b32 s2, s3, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s5, s2
; GFX11-NEXT: s_sext_i32_i16 s7, s3
; GFX11-NEXT: s_ashr_i32 s4, s2, 24
; GFX11-NEXT: s_ashr_i32 s6, s3, 24
; GFX11-NEXT: s_sext_i32_i8 s8, s3
; GFX11-NEXT: s_sext_i32_i8 s9, s2
; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80010
; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80010
; GFX11-NEXT: s_ashr_i32 s7, s7, 8
; GFX11-NEXT: s_ashr_i32 s5, s5, 8
; GFX11-NEXT: s_min_i32 s8, s9, s8
; GFX11-NEXT: s_min_i32 s4, s4, s6
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_min_i32 s3, s5, s7
; GFX11-NEXT: s_and_b32 s5, s8, 0xff
; GFX11-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s3, s5, s3
; GFX11-NEXT: s_or_b32 s2, s2, s4
; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v4i8:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x2
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sext_i32_i16 s5, s2
; GFX1250-NEXT: s_sext_i32_i16 s7, s3
; GFX1250-NEXT: s_ashr_i32 s4, s2, 24
; GFX1250-NEXT: s_ashr_i32 s6, s3, 24
; GFX1250-NEXT: s_sext_i32_i8 s8, s3
; GFX1250-NEXT: s_sext_i32_i8 s9, s2
; GFX1250-NEXT: s_bfe_i32 s3, s3, 0x80010
; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x80010
; GFX1250-NEXT: s_ashr_i32 s7, s7, 8
; GFX1250-NEXT: s_ashr_i32 s5, s5, 8
; GFX1250-NEXT: s_min_i32 s8, s9, s8
; GFX1250-NEXT: s_min_i32 s4, s4, s6
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: s_min_i32 s3, s5, s7
; GFX1250-NEXT: s_and_b32 s5, s8, 0xff
; GFX1250-NEXT: s_lshl_b32 s4, s4, 8
; GFX1250-NEXT: s_lshl_b32 s3, s3, 8
; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
; GFX1250-NEXT: s_or_b32 s3, s5, s3
; GFX1250-NEXT: s_or_b32 s2, s2, s4
; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1250-NEXT: s_lshl_b32 s2, s2, 16
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_or_b32 s2, s3, s2
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Lane-wise (%a[i] <= %b[i]) ? %a[i] : %b[i] over four signed 8-bit lanes.
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %val, ptr addrspace(1) %out
ret void
}
; Signed minimum of two <2 x i16> kernel arguments, written as icmp sle + select.
; %a and %b are passed directly as kernel arguments (no loads in the IR body).
; The assertions below expect a single packed v_pk_min_i16 for the gfx9 and newer
; prefixes, sign-extension followed by two s_min_i32 for the kaveri and tonga
; prefixes, and BFE_INT + MIN_INT per element for the r600 cypress prefix.
define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
; EG-LABEL: s_test_imin_sle_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT T4.X, PV.W, PS,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_ashr_i32 s5, s3, 16
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s4, s4, s5
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: s_lshl_b32 s3, s4, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_or_b32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s3, 16
; VI-NEXT: s_ashr_i32 s5, s2, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_min_i32 s4, s5, s4
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_min_i16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_min_i16 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v2i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_min_i16 v1, s2, s3
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; select(a <= b, a, b) per element is the signed-minimum idiom under test.
%cmp = icmp sle <2 x i16> %a, %b
%val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
store <2 x i16> %val, ptr addrspace(1) %out
ret void
}
; Signed minimum of two <4 x i16> kernel arguments, written as icmp sle + select.
; %a and %b are passed directly as kernel arguments (no loads in the IR body).
; The assertions below expect two packed v_pk_min_i16 for the gfx9 and newer
; prefixes, sign-extension followed by four s_min_i32 for the kaveri and tonga
; prefixes, and one MIN_INT per element (four ALU clauses) for the r600 cypress
; prefix.
define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @28, KC0[], KC1[]
; EG-NEXT: TEX 1 @12
; EG-NEXT: ALU 9, @30, KC0[], KC1[]
; EG-NEXT: TEX 1 @16
; EG-NEXT: ALU 10, @40, KC0[], KC1[]
; EG-NEXT: TEX 1 @20
; EG-NEXT: ALU 10, @51, KC0[], KC1[]
; EG-NEXT: TEX 1 @24
; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
; EG-NEXT: ALU clause starting at 28:
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 30:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 40:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 51:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 62:
; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
; EG-NEXT: 2(2.802597e-45), -65536(nan)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T6.X, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.X,
; EG-NEXT: MOV * T6.Y, T3.X,
;
; CI-LABEL: s_test_imin_sle_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s6, s0, 16
; CI-NEXT: s_ashr_i32 s7, s1, 16
; CI-NEXT: s_sext_i32_i16 s0, s0
; CI-NEXT: s_sext_i32_i16 s1, s1
; CI-NEXT: s_ashr_i32 s8, s2, 16
; CI-NEXT: s_ashr_i32 s9, s3, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s7, s7, s9
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s3, s6, s8
; CI-NEXT: s_min_i32 s0, s0, s2
; CI-NEXT: s_lshl_b32 s7, s7, 16
; CI-NEXT: s_and_b32 s1, s1, 0xffff
; CI-NEXT: s_lshl_b32 s3, s3, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_or_b32 s1, s1, s7
; CI-NEXT: s_or_b32 s0, s0, s3
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s6, s3, 16
; VI-NEXT: s_ashr_i32 s7, s1, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_min_i32 s6, s7, s6
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s1, s1, s6
; VI-NEXT: s_ashr_i32 s3, s2, 16
; VI-NEXT: s_ashr_i32 s6, s0, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_min_i32 s3, s6, s3
; VI-NEXT: s_min_i32 s0, s0, s2
; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s3
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_min_i16 v1, s1, s3
; GFX11-NEXT: v_pk_min_i16 v0, s0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v4i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3
; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX1250-NEXT: s_endpgm
; select(a <= b, a, b) per element is the signed-minimum idiom under test.
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %val, ptr addrspace(1) %out
ret void
}
; Signed minimum of two i32 values loaded from memory, written as icmp slt + select.
; Each work-item uses its workitem.id.x to index %aptr, %bptr and %out, so the
; operands are per-lane values rather than uniform kernel arguments.
; The assertions below expect v_min_i32_e32 for every amdgcn prefix and MIN_INT
; for the r600 cypress prefix.
define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; EG-LABEL: v_test_imin_slt_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v5, v[0:1]
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_i32_e32 v2, v5, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_i32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_slt_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_slt_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_imin_slt_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_imin_slt_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; Per-work-item addressing: the same %tid indexes both inputs and the output.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
; select(a < b, a, b) is the signed-minimum idiom under test.
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out.gep, align 4
ret void
}
; Signed minimum of two i16 values loaded from memory, written as icmp slt + select.
; Each work-item uses its workitem.id.x to index %aptr, %bptr and %out.
; The assertions below expect:
;   - kaveri prefix: sign-extending flat_load_sshort loads feeding v_min_i32_e32;
;   - tonga and gfx9 prefixes: v_min_i16_e32;
;   - gfx10, gfx11 fake16 and gfx1250 prefixes: v_min_i16 on full registers;
;   - gfx11 true16 prefix: d16 lo/hi loads into one register and v_min_i16 on
;     its .l/.h halves;
;   - r600 cypress prefix: BFE_INT sign-extension, MIN_INT, and a MEM_RAT MSKOR store.
; NOTE(review): the TODO next to the RUN lines at the top of this file reports that
; the gfx1250 +real-true16 run is disabled because llc fails to select the i16 load
; in this function; confirm that is still the case before enabling that run line.
define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; EG-LABEL: v_test_imin_slt_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PS, literal.x,
; EG-NEXT: MIN_INT * T1.W, PV.W, PV.Z,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, literal.x,
; EG-NEXT: LSHL * T2.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T1.X, PV.W, PS,
; EG-NEXT: LSHL * T1.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T1.Y, 0.0,
; EG-NEXT: MOV * T1.Z, 0.0,
; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_slt_i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_sshort v5, v[0:1]
; CI-NEXT: flat_load_sshort v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_i32_e32 v2, v5, v2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_slt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_i16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_slt_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_slt_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i16 v1, v1, v2
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_imin_slt_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_imin_slt_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_min_i16 v1, v1, v2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_imin_slt_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_i16 v1, v1, v2
; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; Per-work-item addressing: the same %tid indexes both inputs and the output.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
%out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
; Unlike the i32 variant, these memory operations carry no explicit align.
%a = load i16, ptr addrspace(1) %a.gep
%b = load i16, ptr addrspace(1) %b.gep
; select(a < b, a, b) is the signed-minimum idiom under test.
%cmp = icmp slt i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, ptr addrspace(1) %out.gep
ret void
}
; Signed minimum of two i32 kernel arguments, written as icmp slt + select.
; %a and %b are passed directly as kernel arguments (no loads in the IR body).
; The assertions below expect a single scalar s_min_i32 for every amdgcn prefix
; and a MIN_INT reading both operands from KC0 for the r600 cypress prefix.
define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; EG-LABEL: s_test_imin_slt_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_slt_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_slt_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_slt_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_slt_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; select(a < b, a, b) is the signed-minimum idiom under test.
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Signed minimum of two <2 x i32> kernel arguments, written as icmp slt + select.
; %a and %b are passed directly as kernel arguments (no loads in the IR body).
; The assertions below expect one scalar s_min_i32 per element (two in total) for
; every amdgcn prefix and two MIN_INT for the r600 cypress prefix.
define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
; EG-LABEL: s_test_imin_slt_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MIN_INT * T0.Y, KC0[3].X, KC0[3].Z,
; EG-NEXT: MIN_INT * T0.X, KC0[2].W, KC0[3].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_slt_v2i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_slt_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_slt_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s1, s1, s3
; GFX9-NEXT: s_min_i32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_slt_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s0, s0, s2
; GFX10-NEXT: s_min_i32 s1, s1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_slt_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s0, s0, s2
; GFX11-NEXT: s_min_i32 s1, s1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_slt_v2i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s0, s0, s2
; GFX1250-NEXT: s_min_i32 s1, s1, s3
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX1250-NEXT: s_endpgm
; select(a < b, a, b) per element is the signed-minimum idiom under test.
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %val, ptr addrspace(1) %out
ret void
}
; Signed minimum of an i32 kernel argument and the constant 8, written as
; icmp slt + select with the same constant in both the compare and the select.
; The assertions below expect the constant to appear directly as the second
; operand of s_min_i32 for every amdgcn prefix, and as a literal operand of
; MIN_INT for the r600 cypress prefix.
define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
; EG-LABEL: s_test_imin_slt_imm_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; CI-LABEL: s_test_imin_slt_imm_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_slt_imm_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_slt_imm_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, 8
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_slt_imm_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, 8
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_slt_imm_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, 8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_slt_imm_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s2, 8
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; select(a < 8, a, 8) is the signed-minimum-with-constant idiom under test.
%cmp = icmp slt i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Signed min of an i32 kernel argument against the constant 8, written with a
; non-strict compare, i.e. select(a sle 8, a, 8). Every checked target is
; expected to emit exactly one signed-min instruction that takes 8 as a
; literal/immediate operand (MIN_INT on r600, s_min_i32 on the amdgcn targets).
define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
; EG-LABEL: s_test_imin_sle_imm_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; CI-LABEL: s_test_imin_sle_imm_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_imm_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_imm_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, 8
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_imm_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, 8
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_imm_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, 8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_imm_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_i32 s2, s2, 8
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; The compare+select pair below is the min pattern under test.
%cmp = icmp sle i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Per-workitem unsigned min with a non-strict compare: each workitem loads
; a.ptr[tid] and b.ptr[tid], computes select(a ule b, a, b) and stores the
; result to out[tid]. The checks expect a single unsigned-min instruction
; (MIN_UINT on r600, v_min_u32 on the amdgcn targets) between the two loads
; and the store.
define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ule_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v5, v[0:1]
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_u32_e32 v2, v5, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ule_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ule_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ule_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ule_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; All three pointers are indexed by the workitem id, so the operands are
; per-workitem values loaded from memory rather than kernel arguments.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
; The compare+select pair below is the min pattern under test.
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out.gep, align 4
ret void
}
; Per-workitem unsigned min on <3 x i32> with a non-strict compare: each
; workitem loads one 3-element vector from a.ptr and b.ptr, selects the
; element-wise minimum via icmp ule + select, and stores the vector to out.
; The checks expect three per-component unsigned mins (MIN_UINT on r600,
; v_min_u32 on amdgcn); the amdgcn targets load and store the vector as a
; single 3-dword (96-bit) access.
define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ule_v3i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_UINT * T0.Y, T2.Y, T1.Y,
; EG-NEXT: MIN_UINT T0.X, T2.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MIN_UINT * T2.X, T2.Z, T1.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ule_v3i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
; CI-NEXT: v_mov_b32_e32 v7, s1
; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_u32_e32 v2, v2, v5
; CI-NEXT: v_min_u32_e32 v1, v1, v4
; CI-NEXT: v_min_u32_e32 v0, v0, v3
; CI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ule_v3i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u32_e32 v2, v2, v5
; VI-NEXT: v_min_u32_e32 v1, v1, v4
; VI-NEXT: v_min_u32_e32 v0, v0, v3
; VI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ule_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v2, v2, v5
; GFX9-NEXT: v_min_u32_e32 v1, v1, v4
; GFX9-NEXT: v_min_u32_e32 v0, v0, v3
; GFX9-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ule_v3i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v2, v2, v5
; GFX10-NEXT: v_min_u32_e32 v1, v1, v4
; GFX10-NEXT: v_min_u32_e32 v0, v0, v3
; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ule_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b96 v[0:2], v6, s[2:3]
; GFX11-NEXT: global_load_b96 v[3:5], v6, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v2, v2, v5
; GFX11-NEXT: v_min_u32_e32 v1, v1, v4
; GFX11-NEXT: v_min_u32_e32 v0, v0, v3
; GFX11-NEXT: global_store_b96 v6, v[0:2], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ule_v3i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b96 v[0:2], v3, s[2:3]
; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v2, v2, v6
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v5
; GFX1250-NEXT: v_min_u32_e32 v0, v0, v4
; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX1250-NEXT: s_endpgm
; The GEPs step by whole <3 x i32> elements; the expected code scales the
; workitem id by 16 bytes (shift left by 4) to form the address.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
%a = load <3 x i32>, ptr addrspace(1) %a.gep
%b = load <3 x i32>, ptr addrspace(1) %b.gep
; The vector compare+select pair below is the min pattern under test.
%cmp = icmp ule <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
store <3 x i32> %val, ptr addrspace(1) %out.gep
ret void
}
; FIXME: Reduce unused packed component to scalar
;
; Per-workitem unsigned min on <3 x i16> with a non-strict compare: each
; workitem loads one 3-element vector from a.ptr and b.ptr, selects the
; element-wise minimum via icmp ule + select, and stores the vector to out.
; The expected code stores the result as one dword plus one 16-bit value at
; byte offset 4. In the GFX9 and later checks both halves go through the
; packed v_pk_min_u16, although only the low 16 bits of the second result
; are stored. NOTE(review): presumably this is what the FIXME above
; refers to - confirm.
define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ule_v3i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @8
; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @12
; EG-NEXT: ALU 8, @36, KC0[], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 24:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.W, PV.W, literal.x,
; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T7.X, PS, PV.W,
; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV * T7.Y, 0.0,
; EG-NEXT: ALU clause starting at 36:
; EG-NEXT: MOV T7.Z, 0.0,
; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
; EG-NEXT: LSHR T0.X, T1.W, literal.x,
; EG-NEXT: LSHL T1.W, PV.W, literal.y,
; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: OR_INT T6.X, PV.W, PS,
; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ule_v3i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: v_add_i32_e32 v6, vcc, 4, v4
; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; CI-NEXT: v_min_u32_e32 v0, v0, v2
; CI-NEXT: v_min_u32_e32 v2, v8, v9
; CI-NEXT: v_min_u32_e32 v1, v1, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: flat_store_short v[6:7], v1
; CI-NEXT: flat_store_dword v[4:5], v0
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ule_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u16_e32 v8, v0, v2
; VI-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_min_u16_e32 v1, v1, v3
; VI-NEXT: v_or_b32_e32 v0, v8, v0
; VI-NEXT: flat_store_short v[6:7], v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ule_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_min_u16 v1, v1, v3
; GFX9-NEXT: v_pk_min_u16 v0, v0, v2
; GFX9-NEXT: global_store_short v4, v1, s[0:1] offset:4
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ule_v3i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_min_u16 v1, v1, v3
; GFX10-NEXT: v_pk_min_u16 v0, v0, v2
; GFX10-NEXT: global_store_short v4, v1, s[0:1] offset:4
; GFX10-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ule_v3i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3]
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_min_u16 v1, v1, v3
; GFX11-NEXT: v_pk_min_u16 v0, v0, v2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v4, v1, s[0:1] offset:4
; GFX11-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ule_v3i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_min_u16 v1, v1, v3
; GFX1250-NEXT: v_pk_min_u16 v0, v0, v2
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b16 v4, v1, s[0:1] offset:4
; GFX1250-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
; The GEPs step by whole <3 x i16> elements; the expected code scales the
; workitem id by 8 bytes (shift left by 3, or scale_offset on a 64-bit load).
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
%a = load <3 x i16>, ptr addrspace(1) %a.gep
%b = load <3 x i16>, ptr addrspace(1) %b.gep
; The vector compare+select pair below is the min pattern under test.
%cmp = icmp ule <3 x i16> %a, %b
%val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
store <3 x i16> %val, ptr addrspace(1) %out.gep
ret void
}
; Unsigned min of two i32 kernel arguments with a non-strict compare, i.e.
; select(a ule b, a, b), stored to out. Every checked target is expected to
; emit exactly one unsigned-min instruction on the loaded arguments
; (MIN_UINT on r600, s_min_u32 on the amdgcn targets).
define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; EG-LABEL: s_test_umin_ule_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_umin_ule_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_u32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_umin_ule_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_u32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_umin_ule_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_u32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umin_ule_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_u32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; The compare+select pair below is the min pattern under test.
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Per-workitem unsigned min with a strict compare: each workitem loads
; a.ptr[tid] and b.ptr[tid], computes select(a ult b, a, b) and stores the
; result to out[tid]. The checks expect a single unsigned-min instruction
; (MIN_UINT on r600, v_min_u32 on the amdgcn targets) between the two loads
; and the store.
define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ult_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v5, v[0:1]
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_u32_e32 v2, v5, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ult_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ult_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; All three pointers are indexed by the workitem id, so the operands are
; per-workitem values loaded from memory rather than kernel arguments.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
; The compare+select pair below is the min pattern under test.
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out.gep, align 4
ret void
}
; Per-workitem unsigned min on i8: loads one byte from %a.ptr[tid] and one from
; %b.ptr[tid], keeps %a when it is unsigned-less-than %b (otherwise %b), and
; stores the byte to %out[tid]. The expected output below contains a single
; unsigned-min instruction per target and no separate compare/select pair.
define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ult_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: ADD_INT T1.X, KC0[2].Z, T0.X,
; EG-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X,
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: MIN_UINT * T2.W, T1.X, T2.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.X, T2.W, PV.W,
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T1.Y, 0.0,
; EG-NEXT: MOV * T1.Z, 0.0,
; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ult_i8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
; CI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; CI-NEXT: v_mov_b32_e32 v4, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v0
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; CI-NEXT: flat_load_ubyte v2, v[1:2]
; CI-NEXT: flat_load_ubyte v3, v[3:4]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_u32_e32 v2, v2, v3
; CI-NEXT: flat_store_byte v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ult_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[1:2]
; VI-NEXT: flat_load_ubyte v3, v[3:4]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u16_e32 v2, v2, v3
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ult_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_u16_e32 v1, v1, v2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_u16 v1, v1, v2
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_umin_ult_i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: global_load_d16_hi_u8 v0, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_umin_ult_i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: global_load_u8 v2, v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_min_u16 v1, v1, v2
; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ult_i8:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX1250-NEXT: global_load_u8 v2, v0, s[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_min_u16 v1, v1, v2
; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
%a = load i8, ptr addrspace(1) %a.gep, align 1
%b = load i8, ptr addrspace(1) %b.gep, align 1
; Unsigned-min idiom: the compare's only user is the select on the same operands.
%cmp = icmp ult i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, ptr addrspace(1) %out.gep, align 1
ret void
}
; Unsigned min of two i32 kernel arguments (%a, %b), stored to %out. Both
; operands come straight from the argument list, and the expected output below
; is a single MIN_UINT on the r600 run and a single s_min_u32 on every amdgcn
; run.
define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; EG-LABEL: s_test_umin_ult_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_umin_ult_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_u32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_umin_ult_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_u32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_umin_ult_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_u32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umin_ult_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_u32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Unsigned-min idiom: the compare's only user is the select on the same operands.
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Unsigned-min pattern on i32 where the icmp result has a second use: besides
; feeding the select, %cmp is itself stored as an i1 to %out1. The two i32
; inputs are loaded from %aptr and %bptr and the selected value is stored to
; %out0. The expected output below contains no min instruction; each target
; emits a compare followed by a conditional select, and separately stores the
; materialized compare result.
define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; EG-LABEL: v_test_umin_ult_i32_multi_use:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 16, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: MOV * T1.X, KC0[3].X,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, 1,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.X, PV.W, PS,
; EG-NEXT: LSHL * T2.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: LSHR T3.X, KC0[2].Z, literal.x,
; EG-NEXT: SETGE_UINT * T0.W, T0.X, T1.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ult_i32_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s5, s[6:7], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_lt_u32 s4, s5
; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT: s_cselect_b32 s0, s4, s5
; CI-NEXT: v_mov_b32_e32 v5, s0
; CI-NEXT: flat_store_dword v[0:1], v5
; CI-NEXT: flat_store_byte v[2:3], v4
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s5, s[6:7], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lt_u32 s4, s5
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; VI-NEXT: s_cselect_b32 s0, s4, s5
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_byte v[2:3], v4
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lt_u32 s8, s9
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: s_cselect_b32 s4, s8, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lt_u32 s8, s9
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
; GFX10-NEXT: s_cselect_b32 s4, s8, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lt_u32 s4, s5
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, s5
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ult_i32_multi_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s5, s[6:7], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_cmp_lt_u32 s4, s5
; GFX1250-NEXT: s_cselect_b32 s6, -1, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
; GFX1250-NEXT: s_and_b32 s6, s6, exec_lo
; GFX1250-NEXT: s_cselect_b32 s4, s4, s5
; GFX1250-NEXT: v_mov_b32_e32 v2, s4
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX1250-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
; %cmp has two users: the select below and the i1 store to %out1.
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, ptr addrspace(1) %out0, align 4
store i1 %cmp, ptr addrspace(1) %out1
ret void
}
; i16 variant of the multi-use unsigned-min pattern: %cmp feeds the select and
; is also stored as an i1 to %out1, while the selected i16 goes to %out0. The
; inputs are loaded from %aptr and %bptr. The expected output below contains no
; min instruction; each target emits an unsigned compare plus a conditional
; select and separately stores the materialized compare result.
define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; EG-LABEL: v_test_umin_ult_i16_multi_use:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: MOV * T1.X, KC0[3].X,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: SETGE_UINT * T1.W, T0.X, T1.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.W, PS, T0.X, T1.X,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.X, PV.W, PS,
; EG-NEXT: LSHL * T2.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, 1,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV T2.Z, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR T1.X, KC0[2].Z, literal.x,
; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ult_i16_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_mov_b32_e32 v2, s6
; CI-NEXT: v_mov_b32_e32 v3, s7
; CI-NEXT: flat_load_ushort v4, v[0:1]
; CI-NEXT: flat_load_ushort v5, v[2:3]
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; CI-NEXT: flat_store_short v[0:1], v4
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_byte v[2:3], v0
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v7, 0xffff, v5
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v7, v6
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_umin_ult_i16_multi_use:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[6:7]
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v0, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_umin_ult_i16_multi_use:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ult_i16_multi_use:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5]
; GFX1250-NEXT: s_wait_loadcnt 0x1
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX1250-NEXT: s_endpgm
%a = load i16, ptr addrspace(1) %aptr, align 2
%b = load i16, ptr addrspace(1) %bptr, align 2
; %cmp has two users: the select below and the i1 store to %out1.
%cmp = icmp ult i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, ptr addrspace(1) %out0, align 2
store i1 %cmp, ptr addrspace(1) %out1
ret void
}
; Unsigned min on single-element vectors: %a and %b are <1 x i32> kernel
; arguments and the <1 x i32> result is stored to %out. The expected output
; below treats the one-element vector like a scalar, with a single MIN_UINT on
; the r600 run and a single s_min_u32 on every amdgcn run.
define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
; EG-LABEL: s_test_umin_ult_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ult_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_umin_ult_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_umin_ult_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_u32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_umin_ult_v1i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_u32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_umin_ult_v1i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_u32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umin_ult_v1i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_u32 s2, s2, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Vector form of the unsigned-min idiom: <1 x i1> compare feeding a vector select.
%cmp = icmp ult <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, ptr addrspace(1) %out
ret void
}
; Element-wise unsigned min on <8 x i32> kernel arguments, with the result
; stored to %out. The expected output below has one unsigned-min instruction
; per element (eight MIN_UINT on the r600 run, eight s_min_u32 on each amdgcn
; run), and the 32-byte result is written as two four-dword stores: one at
; %out and one 16 bytes past it.
define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
; EG-LABEL: s_test_umin_ult_v8i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
; EG-NEXT: MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
; EG-NEXT: MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
; EG-NEXT: MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
; EG-NEXT: MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
; EG-NEXT: MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
; EG-NEXT: MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ult_v8i32:
; CI: ; %bb.0:
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s4, s15, s23
; CI-NEXT: s_min_u32 s5, s14, s22
; CI-NEXT: s_min_u32 s6, s13, s21
; CI-NEXT: s_min_u32 s7, s12, s20
; CI-NEXT: s_min_u32 s2, s19, s27
; CI-NEXT: s_min_u32 s3, s18, s26
; CI-NEXT: s_min_u32 s8, s17, s25
; CI-NEXT: s_min_u32 s9, s16, s24
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v0, s9
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v0, s7
; CI-NEXT: v_mov_b32_e32 v1, s6
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_mov_b32_e32 v3, s4
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_umin_ult_v8i32:
; VI: ; %bb.0:
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s4, s15, s23
; VI-NEXT: s_min_u32 s5, s14, s22
; VI-NEXT: s_min_u32 s6, s13, s21
; VI-NEXT: s_min_u32 s7, s12, s20
; VI-NEXT: s_min_u32 s2, s19, s27
; VI-NEXT: s_min_u32 s3, s18, s26
; VI-NEXT: s_min_u32 s8, s17, s25
; VI-NEXT: s_min_u32 s9, s16, s24
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_umin_ult_v8i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_u32 s6, s19, s27
; GFX9-NEXT: s_min_u32 s7, s18, s26
; GFX9-NEXT: s_min_u32 s8, s17, s25
; GFX9-NEXT: s_min_u32 s9, s16, s24
; GFX9-NEXT: s_min_u32 s2, s15, s23
; GFX9-NEXT: s_min_u32 s3, s14, s22
; GFX9-NEXT: s_min_u32 s4, s13, s21
; GFX9-NEXT: s_min_u32 s5, s12, s20
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_umin_ult_v8i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_u32 s6, s19, s27
; GFX10-NEXT: s_min_u32 s7, s18, s26
; GFX10-NEXT: s_min_u32 s8, s16, s24
; GFX10-NEXT: s_min_u32 s9, s17, s25
; GFX10-NEXT: s_min_u32 s2, s15, s23
; GFX10-NEXT: s_min_u32 s3, s14, s22
; GFX10-NEXT: s_min_u32 s4, s13, s21
; GFX10-NEXT: s_min_u32 s5, s12, s20
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v4, s5
; GFX10-NEXT: v_mov_b32_e32 v5, s4
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_umin_ult_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x20
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_u32 s4, s9, s17
; GFX11-NEXT: s_min_u32 s5, s8, s16
; GFX11-NEXT: s_min_u32 s6, s15, s23
; GFX11-NEXT: s_min_u32 s7, s14, s22
; GFX11-NEXT: s_min_u32 s8, s12, s20
; GFX11-NEXT: s_min_u32 s9, s13, s21
; GFX11-NEXT: s_min_u32 s2, s11, s19
; GFX11-NEXT: s_min_u32 s3, s10, s18
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: v_mov_b32_e32 v2, s7
; GFX11-NEXT: v_mov_b32_e32 v3, s6
; GFX11-NEXT: v_mov_b32_e32 v4, s5
; GFX11-NEXT: v_mov_b32_e32 v5, s4
; GFX11-NEXT: v_mov_b32_e32 v6, s3
; GFX11-NEXT: v_mov_b32_e32 v7, s2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umin_ult_v8i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b512 s[8:23], s[4:5], 0x20
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v8, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_min_u32 s4, s9, s17
; GFX1250-NEXT: s_min_u32 s5, s8, s16
; GFX1250-NEXT: s_min_u32 s6, s15, s23
; GFX1250-NEXT: s_min_u32 s7, s14, s22
; GFX1250-NEXT: s_min_u32 s8, s12, s20
; GFX1250-NEXT: s_min_u32 s9, s13, s21
; GFX1250-NEXT: s_min_u32 s2, s11, s19
; GFX1250-NEXT: s_min_u32 s3, s10, s18
; GFX1250-NEXT: v_mov_b32_e32 v0, s8
; GFX1250-NEXT: v_mov_b32_e32 v1, s9
; GFX1250-NEXT: v_mov_b32_e32 v2, s7
; GFX1250-NEXT: v_mov_b32_e32 v3, s6
; GFX1250-NEXT: v_mov_b32_e32 v4, s5
; GFX1250-NEXT: v_mov_b32_e32 v5, s4
; GFX1250-NEXT: v_mov_b32_e32 v6, s3
; GFX1250-NEXT: v_mov_b32_e32 v7, s2
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX1250-NEXT: s_endpgm
; Vector form of the unsigned-min idiom: <8 x i1> compare feeding a vector select.
%cmp = icmp ult <8 x i32> %a, %b
%val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
store <8 x i32> %val, ptr addrspace(1) %out
ret void
}
; Unsigned minimum of two <8 x i16> kernel arguments, written as an
; element-wise 'icmp ult' + 'select', with the result stored to %out.
; The GFX9 and newer assertion blocks expect four v_pk_min_u16 instructions;
; the CI and VI blocks expect each dword to be split into 16-bit halves,
; reduced with s_min_u32 and repacked with shift/or.
define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
; EG-LABEL: s_test_umin_ult_v8i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @52, KC0[], KC1[]
; EG-NEXT: TEX 1 @20
; EG-NEXT: ALU 9, @54, KC0[], KC1[]
; EG-NEXT: TEX 1 @24
; EG-NEXT: ALU 8, @64, KC0[], KC1[]
; EG-NEXT: TEX 1 @28
; EG-NEXT: ALU 10, @73, KC0[], KC1[]
; EG-NEXT: TEX 1 @32
; EG-NEXT: ALU 8, @84, KC0[], KC1[]
; EG-NEXT: TEX 1 @36
; EG-NEXT: ALU 10, @93, KC0[], KC1[]
; EG-NEXT: TEX 1 @40
; EG-NEXT: ALU 8, @104, KC0[], KC1[]
; EG-NEXT: TEX 1 @44
; EG-NEXT: ALU 10, @113, KC0[], KC1[]
; EG-NEXT: TEX 1 @48
; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
; EG-NEXT: Fetch clause starting at 28:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
; EG-NEXT: Fetch clause starting at 32:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
; EG-NEXT: Fetch clause starting at 36:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
; EG-NEXT: Fetch clause starting at 40:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
; EG-NEXT: Fetch clause starting at 44:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
; EG-NEXT: Fetch clause starting at 48:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
; EG-NEXT: ALU clause starting at 52:
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T7.X, 0.0,
; EG-NEXT: ALU clause starting at 54:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 64:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 73:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 84:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 93:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 104:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 113:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: ALU clause starting at 124:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), -65536(nan)
; EG-NEXT: OR_INT * T7.X, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.X,
; EG-NEXT: MOV * T7.W, T3.X,
; EG-NEXT: MOV * T7.Y, T5.X,
;
; CI-LABEL: s_test_umin_ult_v8i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_lshr_b32 s11, s1, 16
; CI-NEXT: s_and_b32 s1, s1, 0xffff
; CI-NEXT: s_lshr_b32 s12, s2, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_lshr_b32 s13, s3, 16
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_lshr_b32 s14, s4, 16
; CI-NEXT: s_and_b32 s4, s4, 0xffff
; CI-NEXT: s_lshr_b32 s15, s5, 16
; CI-NEXT: s_and_b32 s5, s5, 0xffff
; CI-NEXT: s_lshr_b32 s16, s6, 16
; CI-NEXT: s_and_b32 s6, s6, 0xffff
; CI-NEXT: s_lshr_b32 s17, s7, 16
; CI-NEXT: s_and_b32 s7, s7, 0xffff
; CI-NEXT: s_min_u32 s3, s3, s7
; CI-NEXT: s_min_u32 s7, s13, s17
; CI-NEXT: s_min_u32 s2, s2, s6
; CI-NEXT: s_min_u32 s6, s12, s16
; CI-NEXT: s_min_u32 s1, s1, s5
; CI-NEXT: s_min_u32 s5, s11, s15
; CI-NEXT: s_min_u32 s0, s0, s4
; CI-NEXT: s_min_u32 s4, s10, s14
; CI-NEXT: s_lshl_b32 s7, s7, 16
; CI-NEXT: s_lshl_b32 s6, s6, 16
; CI-NEXT: s_lshl_b32 s5, s5, 16
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: s_or_b32 s3, s3, s7
; CI-NEXT: s_or_b32 s2, s2, s6
; CI-NEXT: s_or_b32 s1, s1, s5
; CI-NEXT: s_or_b32 s0, s0, s4
; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_umin_ult_v8i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s10, s7, 0xffff
; VI-NEXT: s_and_b32 s11, s3, 0xffff
; VI-NEXT: s_lshr_b32 s7, s7, 16
; VI-NEXT: s_lshr_b32 s3, s3, 16
; VI-NEXT: s_min_u32 s3, s3, s7
; VI-NEXT: s_min_u32 s10, s11, s10
; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: s_or_b32 s3, s10, s3
; VI-NEXT: s_and_b32 s7, s6, 0xffff
; VI-NEXT: s_and_b32 s10, s2, 0xffff
; VI-NEXT: s_lshr_b32 s6, s6, 16
; VI-NEXT: s_lshr_b32 s2, s2, 16
; VI-NEXT: s_min_u32 s2, s2, s6
; VI-NEXT: s_min_u32 s7, s10, s7
; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_or_b32 s2, s7, s2
; VI-NEXT: s_and_b32 s6, s5, 0xffff
; VI-NEXT: s_and_b32 s7, s1, 0xffff
; VI-NEXT: s_lshr_b32 s5, s5, 16
; VI-NEXT: s_lshr_b32 s1, s1, 16
; VI-NEXT: s_min_u32 s1, s1, s5
; VI-NEXT: s_min_u32 s6, s7, s6
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_or_b32 s1, s6, s1
; VI-NEXT: s_and_b32 s5, s4, 0xffff
; VI-NEXT: s_and_b32 s6, s0, 0xffff
; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: s_lshr_b32 s0, s0, 16
; VI-NEXT: s_min_u32 s0, s0, s4
; VI-NEXT: s_min_u32 s5, s6, s5
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_or_b32 s0, s5, s0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_umin_ult_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_pk_min_u16 v3, s3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_pk_min_u16 v2, s2, v1
; GFX9-NEXT: v_pk_min_u16 v1, s1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_pk_min_u16 v0, s0, v0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_umin_ult_v8i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_min_u16 v3, s3, s7
; GFX10-NEXT: v_pk_min_u16 v2, s2, s6
; GFX10-NEXT: v_pk_min_u16 v1, s1, s5
; GFX10-NEXT: v_pk_min_u16 v0, s0, s4
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_umin_ult_v8i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_min_u16 v3, s11, s15
; GFX11-NEXT: v_pk_min_u16 v2, s10, s14
; GFX11-NEXT: v_pk_min_u16 v1, s9, s13
; GFX11-NEXT: v_pk_min_u16 v0, s8, s12
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umin_ult_v8i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_min_u16 v3, s11, s15
; GFX1250-NEXT: v_pk_min_u16 v2, s10, s14
; GFX1250-NEXT: v_pk_min_u16 v1, s9, s13
; GFX1250-NEXT: v_pk_min_u16 v0, s8, s12
; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX1250-NEXT: s_endpgm
; Per lane: pick %a when %a < %b (unsigned), otherwise %b.
%cmp = icmp ult <8 x i16> %a, %b
%val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
store <8 x i16> %val, ptr addrspace(1) %out
ret void
}
; Make sure the redundant 'and' is removed.
; Both operands are zero-extended from i16 before the unsigned minimum, so
; the trailing 'and ..., 65535' cannot change the result. The amdgcn
; assertion blocks below mask only the two inputs and have no 'and' after
; s_min_u32.
; NOTE(review): the r600 assertions show BFE_INT feeding MIN_UINT even though
; the arguments are zeroext; presumably BFE_INT is a signed extract - confirm
; that this output is intended.
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_UINT T0.X, PV.Z, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: s_min_u32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-NEXT: s_min_u32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_u32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: simplify_demanded_bits_test_umin_ult_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x2
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff
; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_min_u32 s2, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Widen both i16 arguments to i32 with zero extension.
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
; Unsigned minimum expressed as compare + select.
%cmp = icmp ult i32 %a.ext, %b.ext
%val = select i1 %cmp, i32 %a.ext, i32 %b.ext
; Mask of the low 16 bits; a no-op on a value chosen from two zext'd i16s.
%mask = and i32 %val, 65535
store i32 %mask, ptr addrspace(1) %out
ret void
}
; Make sure redundant sign_extend_inreg removed.
; Both operands are sign-extended from i16 before the signed minimum, so the
; shl/ashr-by-16 pair applied afterwards re-extends a value that is already
; sign-extended. The amdgcn assertion blocks below contain only the two
; s_sext_i32_i16 on the inputs followed by s_min_i32.
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
; EG-LABEL: simplify_demanded_bits_test_min_slt_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: simplify_demanded_bits_test_min_slt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: s_sext_i32_i16 s3, s3
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s2, s2
; GFX10-NEXT: s_sext_i32_i16 s3, s3
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s2, s2
; GFX11-NEXT: s_sext_i32_i16 s3, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: simplify_demanded_bits_test_min_slt_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_clause 0x2
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sext_i32_i16 s2, s2
; GFX1250-NEXT: s_sext_i32_i16 s3, s3
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Widen both i16 arguments to i32 with sign extension.
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
; Signed minimum expressed as compare + select.
%cmp = icmp slt i32 %a.ext, %b.ext
%val = select i1 %cmp, i32 %a.ext, i32 %b.ext
; shl 16 followed by ashr 16 sign-extends the low 16 bits of %val in place.
%shl = shl i32 %val, 16
%sextinreg = ashr i32 %shl, 16
store i32 %sextinreg, ptr addrspace(1) %out
ret void
}
; Signed minimum of two scalar i16 kernel arguments, written as
; 'icmp sle' + 'select'; the i16 result is stored to %out.
; The amdgcn assertion blocks expect both halves of the loaded dword to be
; sign-extended (s_sext_i32_i16 / s_ashr_i32 by 16) and combined with
; s_min_i32, followed by a 16-bit store.
define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
; EG-LABEL: s_test_imin_sle_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s3, s2
; CI-NEXT: s_ashr_i32 s2, s2, 16
; CI-NEXT: s_min_i32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s3, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s3, s2, 16
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s3, s2, 16
; GFX10-NEXT: s_sext_i32_i16 s2, s2
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_ashr_i32 s3, s2, 16
; GFX11-NEXT: s_sext_i32_i16 s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_ashr_i32 s3, s2, 16
; GFX1250-NEXT: s_sext_i32_i16 s2, s2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_min_i32 s2, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1250-NEXT: s_endpgm
; Pick %a when %a <= %b (signed), otherwise %b.
%cmp = icmp sle i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, ptr addrspace(1) %out
ret void
}
; 64-bit tests.
; Unsigned minimum of two scalar i64 kernel arguments, written as
; 'icmp ult' + 'select', stored to %out with 8-byte alignment.
; The GFX1250 assertion block expects a single v_min_u64; the other amdgcn
; blocks expect a v_cmp_lt_u64 followed by two s_cselect_b32, and the r600
; block expects a SETE/SETGT/CNDE sequence on the 32-bit halves.
define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_umin_ult_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_umin_ult_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b64 s[0:1], vcc, exec
; CI-NEXT: s_cselect_b32 s0, s3, s5
; CI-NEXT: s_cselect_b32 s1, s2, s4
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: test_umin_ult_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-NEXT: s_cselect_b32 s0, s3, s5
; VI-NEXT: s_cselect_b32 s1, s2, s4
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umin_ult_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s3, s3, s5
; GFX9-NEXT: s_cselect_b32 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_umin_ult_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
; GFX10-NEXT: s_cselect_b32 s2, s2, s4
; GFX10-NEXT: s_cselect_b32 s3, s3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_umin_ult_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: test_umin_ult_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
; Pick %a when %a < %b (unsigned), otherwise %b.
%tmp = icmp ult i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, ptr addrspace(1) %out, align 8
ret void
}
; Unsigned minimum of two scalar i64 kernel arguments, written with the
; non-strict predicate: 'icmp ule' + 'select', stored to %out with 8-byte
; alignment.
; The GFX1250 assertion block expects a single v_min_u64; the other amdgcn
; blocks expect a v_cmp_le_u64 followed by two s_cselect_b32.
define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_umin_ule_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_umin_ule_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b64 s[0:1], vcc, exec
; CI-NEXT: s_cselect_b32 s0, s3, s5
; CI-NEXT: s_cselect_b32 s1, s2, s4
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: test_umin_ule_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-NEXT: s_cselect_b32 s0, s3, s5
; VI-NEXT: s_cselect_b32 s1, s2, s4
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umin_ule_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s3, s3, s5
; GFX9-NEXT: s_cselect_b32 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_umin_ule_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
; GFX10-NEXT: s_cselect_b32 s2, s2, s4
; GFX10-NEXT: s_cselect_b32 s3, s3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_umin_ule_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: test_umin_ule_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
; Pick %a when %a <= %b (unsigned), otherwise %b.
%tmp = icmp ule i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, ptr addrspace(1) %out, align 8
ret void
}
; Scalar signed 64-bit minimum spelled as `icmp slt` + `select` on the two
; i64 kernel arguments %a and %b; the selected value is stored to %out.
; The gfx1250 expectations below contain a single v_min_i64, while the other
; amdgcn expectations contain a 64-bit v_cmp_lt_i64 followed by two
; s_cselect_b32 (one per 32-bit half).
define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_imin_slt_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_imin_slt_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b64 s[0:1], vcc, exec
; CI-NEXT: s_cselect_b32 s0, s3, s5
; CI-NEXT: s_cselect_b32 s1, s2, s4
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: test_imin_slt_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-NEXT: s_cselect_b32 s0, s3, s5
; VI-NEXT: s_cselect_b32 s1, s2, s4
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_imin_slt_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s3, s3, s5
; GFX9-NEXT: s_cselect_b32 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_imin_slt_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
; GFX10-NEXT: s_cselect_b32 s2, s2, s4
; GFX10-NEXT: s_cselect_b32 s3, s3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_imin_slt_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: test_imin_slt_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
; The compare result only feeds the select of its own operands, which is the
; signed-min idiom this test exercises.
%tmp = icmp slt i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, ptr addrspace(1) %out, align 8
ret void
}
; Scalar signed 64-bit minimum spelled as `icmp sle` + `select` on the two
; i64 kernel arguments %a and %b; the selected value is stored to %out.
; The gfx1250 expectations below contain the same v_min_i64 as the `slt`
; variant of this test; the other amdgcn expectations use v_cmp_le_i64
; followed by two s_cselect_b32.
define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_imin_sle_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_imin_sle_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b64 s[0:1], vcc, exec
; CI-NEXT: s_cselect_b32 s0, s3, s5
; CI-NEXT: s_cselect_b32 s1, s2, s4
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: test_imin_sle_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-NEXT: s_cselect_b32 s0, s3, s5
; VI-NEXT: s_cselect_b32 s1, s2, s4
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_imin_sle_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s3, s3, s5
; GFX9-NEXT: s_cselect_b32 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_imin_sle_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
; GFX10-NEXT: s_cselect_b32 s2, s2, s4
; GFX10-NEXT: s_cselect_b32 s3, s3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_imin_sle_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: test_imin_sle_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
; Same idiom as the strict variant but with the non-strict `sle` predicate;
; when %a == %b either operand gives the same stored value.
%tmp = icmp sle i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, ptr addrspace(1) %out, align 8
ret void
}
; Per-workitem signed minimum of two <2 x i16> vectors, spelled as a vector
; `icmp sle` + `select`. Each workitem indexes %a.ptr, %b.ptr and %out by its
; x id, loads one <2 x i16> from each input and stores the selected vector.
; The expectations below show three lowering shapes: a single v_pk_min_i16
; (gfx9 and newer), v_min_i16 plus an SDWA v_min_i16 for the high half (tonga),
; and sign-extension of both halves followed by 32-bit signed min (kaveri and
; the r600 run).
define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_sle_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: LSHR T1.W, T0.X, literal.x,
; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
; EG-NEXT: LSHL T2.W, PS, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT T0.X, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_bfe_i32 v2, v4, 0, 16
; CI-NEXT: v_ashrrev_i32_e32 v4, 16, v4
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
; CI-NEXT: v_ashrrev_i32_e32 v3, 16, v3
; CI-NEXT: v_min_i32_e32 v3, v4, v3
; CI-NEXT: v_min_i32_e32 v2, v2, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_i16_e32 v3, v5, v2
; VI-NEXT: v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_sle_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_min_i16 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_sle_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_min_i16 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_imin_sle_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_min_i16 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_imin_sle_v2i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_min_i16 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; The workitem x id is the element index into all three buffers, so the
; addresses (and therefore the loads and the min) are per-lane values.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
%a = load <2 x i16>, ptr addrspace(1) %a.gep
%b = load <2 x i16>, ptr addrspace(1) %b.gep
; Element-wise signed compare + select of the compared operands: the vector
; signed-min idiom under test.
%cmp = icmp sle <2 x i16> %a, %b
%val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
store <2 x i16> %val, ptr addrspace(1) %out.gep
ret void
}
; FIXME: i16 min
; Per-workitem minimum of two <2 x i16> vectors using the UNSIGNED predicate
; `icmp ule` + `select`. Note that although the function name says "imin",
; the compare is unsigned and every expectation below uses an unsigned min
; (v_pk_min_u16, v_min_u16, v_min_u32 or MIN_UINT).
; Each workitem indexes %a.ptr, %b.ptr and %out by its x id, loads one
; <2 x i16> from each input and stores the selected vector.
define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_ule_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: LSHR T1.W, T0.X, literal.x,
; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: MIN_UINT * T2.W, PV.W, PV.Z,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.X, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_ule_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; CI-NEXT: v_min_u32_e32 v2, v2, v5
; CI-NEXT: v_min_u32_e32 v3, v4, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_ule_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_u16_e32 v3, v5, v2
; VI-NEXT: v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_ule_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_min_u16 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_ule_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_min_u16 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_imin_ule_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_min_u16 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_imin_ule_v2i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_min_u16 v1, v1, v2
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
; The workitem x id is the element index into all three buffers, so the
; addresses (and therefore the loads and the min) are per-lane values.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
%b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
%a = load <2 x i16>, ptr addrspace(1) %a.gep
%b = load <2 x i16>, ptr addrspace(1) %b.gep
; Element-wise unsigned compare + select of the compared operands: the
; vector unsigned-min idiom under test.
%cmp = icmp ule <2 x i16> %a, %b
%val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
store <2 x i16> %val, ptr addrspace(1) %out.gep
ret void
}
; Intrinsic called by the v_test_* kernels to obtain the workitem's x id,
; which they use as the element index for their loads and stores.
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the kernel definitions in this file.
attributes #0 = { nounwind }
; Attribute group #1 is attached to the workitem-id intrinsic declaration.
attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX1250-FAKE16: {{.*}}