; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i16 @llvm.ctpop.i16(i16) nounwind readnone
declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind {
; SI-LABEL: s_ctpop_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0xffff
; SI-NEXT: s_bcnt1_i32_b32 s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctpop_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s4, s6, 0xffff
; VI-NEXT: s_bcnt1_i32_b32 s4, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctpop_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, T0.X,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
store i16 %ctpop, ptr addrspace(1) %out, align 4
ret void
}
; XXX - Why is 0 in a register?
define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, T0.X,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
store i16 %ctpop, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind {
; SI-LABEL: v_ctpop_add_chain_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_add_chain_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ushort v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_add_chain_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: AND_INT T0.W, T0.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.Z, PS,
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, PV.W, PV.Z,
; EG-NEXT: LSHL * T1.W, PS, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in0.gep = getelementptr i16, ptr addrspace(1) %in0, i32 %tid
%in1.gep = getelementptr i16, ptr addrspace(1) %in1, i32 %tid
%val0 = load volatile i16, ptr addrspace(1) %in0.gep, align 4
%val1 = load volatile i16, ptr addrspace(1) %in1.gep, align 4
%ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
%ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
%add = add i16 %ctpop0, %ctpop1
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind {
; SI-LABEL: v_ctpop_add_sgpr_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_add_sgpr_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_add_sgpr_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T1.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 %ctpop, %sval
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PS,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.X, PV.W, PS,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x i16>, ptr addrspace(1) %in.gep, align 8
%ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
store <2 x i16> %ctpop, ptr addrspace(1) %out, align 8
ret void
}
define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.X, T5.X,
; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T8.X, T4.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
%ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
store <4 x i16> %ctpop, ptr addrspace(1) %out, align 16
ret void
}
define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v3, v7, v3
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_or_b32_e32 v1, v5, v1
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_v8i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0
; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0
; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0
; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0
; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v3, v3, v4
; VI-NEXT: v_or_b32_e32 v2, v2, v5
; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_v8i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHR * T0.W, T12.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT * T0.W, PV.W,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.X, T5.X,
; EG-NEXT: LSHR * T0.W, T12.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T0.X, T8.X,
; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.X, T9.X,
; EG-NEXT: LSHR * T0.W, T12.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T12.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.X, T4.X,
; EG-NEXT: MOV * T0.Z, T8.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
%ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
store <8 x i16> %ctpop, ptr addrspace(1) %out, align 32
ret void
}
define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v5, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v8, v8, 0
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0
; SI-NEXT: v_or_b32_e32 v3, v15, v7
; SI-NEXT: v_or_b32_e32 v2, v14, v6
; SI-NEXT: v_or_b32_e32 v1, v13, v5
; SI-NEXT: v_or_b32_e32 v0, v12, v4
; SI-NEXT: v_or_b32_e32 v7, v11, v16
; SI-NEXT: v_or_b32_e32 v6, v10, v17
; SI-NEXT: v_or_b32_e32 v5, v9, v18
; SI-NEXT: v_or_b32_e32 v4, v8, v19
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_v16i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4
; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0
; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0
; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0
; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0
; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0
; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0
; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0
; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0
; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0
; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0
; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0
; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v3, v3, v8
; VI-NEXT: v_or_b32_e32 v2, v2, v9
; VI-NEXT: v_or_b32_e32 v1, v1, v10
; VI-NEXT: v_or_b32_e32 v0, v0, v11
; VI-NEXT: v_or_b32_e32 v7, v7, v12
; VI-NEXT: v_or_b32_e32 v6, v6, v13
; VI-NEXT: v_or_b32_e32 v5, v5, v14
; VI-NEXT: v_or_b32_e32 v4, v4, v15
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_v16i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @8
; EG-NEXT: ALU 114, @16, KC0[], KC1[]
; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV T0.Y, T4.X,
; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: LSHR * T0.W, T20.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT * T0.W, PV.W,
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T20.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.X, T5.X,
; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T0.X, T8.X,
; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T8.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.X, T9.X,
; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T9.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T0.W, T20.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.X, T12.X,
; EG-NEXT: LSHR * T1.W, T21.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV * T12.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T1.W, T21.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
; EG-NEXT: MOV T12.X, PV.W,
; EG-NEXT: MOV * T0.X, T13.X,
; EG-NEXT: LSHR * T1.W, T21.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV * T13.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T20.Y, PS, PV.W,
; EG-NEXT: MOV T13.X, PV.Y,
; EG-NEXT: MOV * T0.X, T16.X,
; EG-NEXT: LSHR * T1.W, T21.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: ALU clause starting at 131:
; EG-NEXT: MOV * T16.X, T1.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
; EG-NEXT: MOV T16.X, PV.W,
; EG-NEXT: MOV * T0.X, T17.X,
; EG-NEXT: LSHR * T1.W, T21.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT: MOV * T17.X, PV.W,
; EG-NEXT: MOV T0.X, PV.X,
; EG-NEXT: AND_INT T1.W, T21.W, literal.x,
; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
; EG-NEXT: AND_INT T0.Z, PV.X, literal.x,
; EG-NEXT: BCNT_INT T1.W, PV.W,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: -65536(nan), 16(2.242078e-44)
; EG-NEXT: LSHR T22.X, PS, literal.x,
; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T17.X, PV.W,
; EG-NEXT: MOV * T0.X, T4.X,
; EG-NEXT: MOV * T0.Z, T8.X,
; EG-NEXT: MOV T20.X, T12.X,
; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
%ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
store <16 x i16> %ctpop, ptr addrspace(1) %out, align 32
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_inline_constant:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_inline_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_inline_constant:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 %ctpop, 4
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 4, %ctpop
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_literal:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_movk_i32 s0, 0x3e7
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_literal:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_movk_i32 s4, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_literal:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 999(1.399897e-42), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 %ctpop, 999
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
; SI-LABEL: v_ctpop_i16_add_var:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_var:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_var:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T1.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 %ctpop, %const
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
; SI-LABEL: v_ctpop_i16_add_var_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_var_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_var_inv:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T1.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%add = add i16 %const, %ctpop
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind {
; SI-LABEL: v_ctpop_i16_add_vvar_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctpop_i16_add_vvar_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctpop_i16_add_vvar_inv:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, 1,
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BCNT_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
%val = load i16, ptr addrspace(1) %in.gep, align 4
%ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
%gep = getelementptr i16, ptr addrspace(1) %constptr, i32 %tid
%const = load i16, ptr addrspace(1) %gep, align 4
%add = add i16 %const, %ctpop
store i16 %add, ptr addrspace(1) %out, align 4
ret void
}
; FIXME: We currently disallow SALU instructions in all branches,
; but there are some cases where they should be allowed.
define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) {
; SI-LABEL: ctpop_i16_in_br:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xd
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; SI-NEXT: s_cbranch_execnz .LBB14_3
; SI-NEXT: .LBB14_2: ; %if
; SI-NEXT: s_and_b32 s2, s6, 0xffff
; SI-NEXT: s_bcnt1_i32_b32 s2, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: .LBB14_3: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB14_4:
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_branch .LBB14_2
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s6, s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: s_cbranch_execnz .LBB14_3
; VI-NEXT: .LBB14_2: ; %if
; VI-NEXT: s_and_b32 s2, s6, 0xffff
; VI-NEXT: s_bcnt1_i32_b32 s2, s2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: .LBB14_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_branch .LBB14_2
;
; EG-LABEL: ctpop_i16_in_br:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @20, KC0[], KC1[]
; EG-NEXT: TEX 0 @14
; EG-NEXT: ALU_PUSH_BEFORE 3, @21, KC0[], KC1[]
; EG-NEXT: JUMP @7 POP:1
; EG-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU_POP_AFTER 1, @26, KC0[], KC1[]
; EG-NEXT: ALU_PUSH_BEFORE 2, @28, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @11 POP:1
; EG-NEXT: TEX 0 @18
; EG-NEXT: ALU_POP_AFTER 0, @31, KC0[], KC1[]
; EG-NEXT: ALU 11, @32, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 14:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 21:
; EG-NEXT: MOV T1.W, literal.x,
; EG-NEXT: SETNE_INT * T0.W, T1.X, 0.0,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 26:
; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 28:
; EG-NEXT: MOV T0.W, KC0[2].Y,
; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
; EG-NEXT: ALU clause starting at 31:
; EG-NEXT: BCNT_INT * T1.X, T0.X,
; EG-NEXT: ALU clause starting at 32:
; EG-NEXT: LSHL * T1.W, T0.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41)
; EG-NEXT: LSHL T1.X, PS, PV.W,
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T1.Y, 0.0,
; EG-NEXT: MOV * T1.Z, 0.0,
; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%tmp0 = icmp eq i16 %cond, 0
br i1 %tmp0, label %if, label %else
if:
%tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg)
br label %endif
else:
%tmp3 = getelementptr i16, ptr addrspace(1) %in, i16 1
%tmp4 = load i16, ptr addrspace(1) %tmp3
br label %endif
endif:
%tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else]
store i16 %tmp5, ptr addrspace(1) %out
ret void
}