Use the default, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern would be if introducing wider registers is more likely to push the register usage up to the next occupancy tier.
1399 lines
62 KiB
LLVM
1399 lines
62 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
|
|
|
; Disabled endcf collapse at -O0.
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
|
|
|
|
; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness)
|
|
|
|
; Nested ifs where the inner if has no join block of its own: both the skip
; edge out of %bb.outer.then and %bb.inner.then branch straight to
; %bb.outer.end. The optimized checks expect a single exec restore
; (s_or_b64 exec, exec, s[6:7] at .LBB0_3) and a plain s_and_b64 on exec for
; the inner if; the -O0 checks expect two exec restores (.LBB0_3 and .LBB0_4).
define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
|
|
; GCN-LABEL: simple_nested_if:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
|
|
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB0_3
|
|
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, 0
|
|
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
|
|
; GCN-NEXT: s_and_b64 exec, exec, vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB0_3
|
|
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
|
|
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
|
|
; GCN-NEXT: s_mov_b32 s0, s2
|
|
; GCN-NEXT: s_mov_b32 s1, s2
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 1
|
|
; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
|
|
; GCN-NEXT: .LBB0_3: ; %bb.outer.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 3
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_mov_b32 m0, -1
|
|
; GCN-NEXT: ds_write_b32 v1, v0
|
|
; GCN-NEXT: s_endpgm
|
|
;
|
|
; GCN-O0-LABEL: simple_nested_if:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-O0-NEXT: s_mov_b32 s14, -1
|
|
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
|
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
|
|
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
|
|
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 4
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 5
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
|
|
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
|
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: .LBB0_3: ; %Flow
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 4
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 5
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-O0-NEXT: s_mov_b32 m0, -1
|
|
; GCN-O0-NEXT: ds_write_b32 v0, v1
|
|
; GCN-O0-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = icmp ugt i32 %tmp, 1
|
|
; Outer if: entered when the workitem id is greater than 1 (unsigned).
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
|
|
|
|
bb.outer.then: ; preds = %bb
|
|
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
|
|
store i32 0, ptr addrspace(1) %tmp4, align 4
|
|
%tmp5 = icmp eq i32 %tmp, 2
|
|
; Inner if: the skip edge goes directly to %bb.outer.end, which is also the
; outer if's join block.
br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
|
|
|
|
bb.inner.then: ; preds = %bb.outer.then
|
|
%tmp7 = add i32 %tmp, 1
|
|
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
|
|
store i32 1, ptr addrspace(1) %tmp9, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb
|
|
store i32 3, ptr addrspace(3) null
|
|
ret void
|
|
}
|
|
|
|
; Nested ifs where the inner if has its own join block (%bb.inner.end) that
; performs a store before branching on to %bb.outer.end. Both the optimized
; and the -O0 checks expect two separate exec restores (s_or_b64 exec at
; .LBB1_3 and at .LBB1_4).
define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
|
|
; GCN-LABEL: uncollapsable_nested_if:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
|
|
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB1_4
|
|
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, 0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
|
|
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
|
|
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
|
|
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
|
|
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB1_3
|
|
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
|
|
; GCN-NEXT: s_mov_b32 s0, s2
|
|
; GCN-NEXT: s_mov_b32 s1, s2
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
|
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
|
|
; GCN-NEXT: .LBB1_3: ; %bb.inner.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: s_mov_b32 s0, s2
|
|
; GCN-NEXT: s_mov_b32 s1, s2
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
|
|
; GCN-NEXT: .LBB1_4: ; %Flow
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 3
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_mov_b32 m0, -1
|
|
; GCN-NEXT: ds_write_b32 v1, v0
|
|
; GCN-NEXT: s_endpgm
|
|
;
|
|
; GCN-O0-LABEL: uncollapsable_nested_if:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-O0-NEXT: s_mov_b32 s14, -1
|
|
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
|
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
|
|
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
|
|
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 4
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 5
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
|
|
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
|
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: s_branch .LBB1_4
|
|
; GCN-O0-NEXT: .LBB1_3: ; %Flow
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: s_branch .LBB1_5
|
|
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s2, v4, 4
|
|
; GCN-O0-NEXT: v_readlane_b32 s3, v4, 5
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
|
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: s_branch .LBB1_3
|
|
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-O0-NEXT: s_mov_b32 m0, -1
|
|
; GCN-O0-NEXT: ds_write_b32 v0, v1
|
|
; GCN-O0-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = icmp ugt i32 %tmp, 1
|
|
; Outer if: entered when the workitem id is greater than 1 (unsigned).
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
|
|
|
|
bb.outer.then: ; preds = %bb
|
|
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
|
|
store i32 0, ptr addrspace(1) %tmp4, align 4
|
|
%tmp5 = icmp eq i32 %tmp, 2
|
|
; Inner if: the skip edge goes to %bb.inner.end, a join block distinct from
; %bb.outer.end.
br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
|
|
|
|
bb.inner.then: ; preds = %bb.outer.then
|
|
%tmp7 = add i32 %tmp, 1
|
|
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
|
|
store i32 1, ptr addrspace(1) %tmp8, align 4
|
|
br label %bb.inner.end
|
|
|
|
bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then
|
|
%tmp9 = add i32 %tmp, 2
|
|
%tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
|
|
; This store sits in the inner join block, before control reaches
; %bb.outer.end.
store i32 2, ptr addrspace(1) %tmp10, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.outer.end: ; preds = %bb.inner.end, %bb
|
|
store i32 3, ptr addrspace(3) null
|
|
ret void
|
|
}
|
|
|
|
; Outer if containing an inner if/else whose two arms (%bb.then and %bb.else)
; both branch directly to %bb.outer.end. The optimized checks expect a single
; s_or_b64 exec restore (at .LBB2_5); the -O0 checks expect two (at .LBB2_5
; and .LBB2_6).
define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
|
|
; GCN-LABEL: nested_if_if_else:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, 0
|
|
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
|
|
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB2_5
|
|
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1
|
|
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
|
|
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
|
|
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GCN-NEXT: s_cbranch_execz .LBB2_3
|
|
; GCN-NEXT: ; %bb.2: ; %bb.else
|
|
; GCN-NEXT: s_mov_b32 s6, 0
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s4, s6
|
|
; GCN-NEXT: s_mov_b32 s5, s6
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
|
|
; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4
|
|
; GCN-NEXT: .LBB2_3: ; %Flow
|
|
; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GCN-NEXT: s_cbranch_execz .LBB2_5
|
|
; GCN-NEXT: ; %bb.4: ; %bb.then
|
|
; GCN-NEXT: s_mov_b32 s6, 0
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s4, s6
|
|
; GCN-NEXT: s_mov_b32 s5, s6
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
|
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4
|
|
; GCN-NEXT: .LBB2_5: ; %bb.outer.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 3
|
|
; GCN-NEXT: s_mov_b32 m0, -1
|
|
; GCN-NEXT: ds_write_b32 v2, v0
|
|
; GCN-NEXT: s_endpgm
|
|
;
|
|
; GCN-O0-LABEL: nested_if_if_else:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-O0-NEXT: s_mov_b32 s14, -1
|
|
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s2, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s3, 1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 2
|
|
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
|
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
|
|
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
|
|
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s2, 4
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s3, 5
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB2_2
|
|
; GCN-O0-NEXT: s_branch .LBB2_4
|
|
; GCN-O0-NEXT: .LBB2_2: ; %Flow
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 4
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 5
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 6
|
|
; GCN-O0-NEXT: v_writelane_b32 v4, s1, 7
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
|
|
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
|
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: s_branch .LBB2_5
|
|
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
|
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: s_branch .LBB2_2
|
|
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 6
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 7
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 3
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-O0-NEXT: s_mov_b32 m0, -1
|
|
; GCN-O0-NEXT: ds_write_b32 v0, v1
|
|
; GCN-O0-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
|
|
store i32 0, ptr addrspace(1) %tmp1, align 4
|
|
%tmp2 = icmp ugt i32 %tmp, 1
|
|
; Outer if: entered when the workitem id is greater than 1 (unsigned).
br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
|
|
|
|
bb.outer.then: ; preds = %bb
|
|
%tmp5 = icmp eq i32 %tmp, 2
|
|
; Inner if/else: both arms end by branching directly to %bb.outer.end.
br i1 %tmp5, label %bb.then, label %bb.else
|
|
|
|
bb.then: ; preds = %bb.outer.then
|
|
%tmp3 = add i32 %tmp, 1
|
|
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
|
|
store i32 1, ptr addrspace(1) %tmp4, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.else: ; preds = %bb.outer.then
|
|
%tmp7 = add i32 %tmp, 2
|
|
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
|
|
store i32 2, ptr addrspace(1) %tmp9, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.outer.end: ; preds = %bb, %bb.then, %bb.else
|
|
store i32 3, ptr addrspace(3) null
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
|
|
; GCN-LABEL: nested_if_else_if:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, 0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
|
|
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
|
|
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0
|
|
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
|
|
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
|
|
; GCN-NEXT: s_cbranch_execz .LBB3_4
|
|
; GCN-NEXT: ; %bb.1: ; %bb.outer.else
|
|
; GCN-NEXT: s_mov_b32 s0, s2
|
|
; GCN-NEXT: s_mov_b32 s1, s2
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 3
|
|
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
|
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
|
|
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB3_3
|
|
; GCN-NEXT: ; %bb.2: ; %bb.inner.then2
|
|
; GCN-NEXT: s_mov_b32 s10, 0
|
|
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s8, s10
|
|
; GCN-NEXT: s_mov_b32 s9, s10
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 4
|
|
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
|
|
; GCN-NEXT: .LBB3_3: ; %Flow
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
|
|
; GCN-NEXT: ; implicit-def: $vgpr0
|
|
; GCN-NEXT: .LBB3_4: ; %Flow2
|
|
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GCN-NEXT: s_cbranch_execz .LBB3_8
|
|
; GCN-NEXT: ; %bb.5: ; %bb.outer.then
|
|
; GCN-NEXT: s_mov_b32 s0, s2
|
|
; GCN-NEXT: s_mov_b32 s1, s2
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 1
|
|
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
|
|
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
|
|
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB3_7
|
|
; GCN-NEXT: ; %bb.6: ; %bb.inner.then
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
|
|
; GCN-NEXT: .LBB3_7: ; %Flow1
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GCN-NEXT: .LBB3_8: ; %bb.outer.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 3
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_mov_b32 m0, -1
|
|
; GCN-NEXT: ds_write_b32 v1, v0
|
|
; GCN-NEXT: s_endpgm
|
|
;
|
|
; GCN-O0-LABEL: nested_if_else_if:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-O0-NEXT: s_mov_b32 s14, -1
|
|
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
|
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-O0-NEXT: s_mov_b32 s2, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s5
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, v3
|
|
; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-O0-NEXT: v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3]
|
|
; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, v1
|
|
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b32 s3, s1
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
|
|
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
|
|
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s2, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s3, 1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB3_1
|
|
; GCN-O0-NEXT: s_branch .LBB3_4
|
|
; GCN-O0-NEXT: .LBB3_1: ; %Flow2
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v6, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v6, 1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
|
|
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0
|
|
; GCN-O0-NEXT: s_mov_b32 s4, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s0
|
|
; GCN-O0-NEXT: s_mov_b32 s0, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 1
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
|
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 4
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 5
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
|
|
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0
|
|
; GCN-O0-NEXT: s_mov_b32 s4, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s0
|
|
; GCN-O0-NEXT: s_mov_b32 s0, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
|
|
; GCN-O0-NEXT: s_branch .LBB3_7
|
|
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0
|
|
; GCN-O0-NEXT: s_mov_b32 s2, s0
|
|
; GCN-O0-NEXT: s_mov_b32 s3, s1
|
|
; GCN-O0-NEXT: s_mov_b32 s4, s0
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
|
|
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 6
|
|
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 7
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
|
|
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0
|
|
; GCN-O0-NEXT: s_mov_b32 s4, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s0
|
|
; GCN-O0-NEXT: s_mov_b32 s0, s2
|
|
; GCN-O0-NEXT: s_mov_b32 s1, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 4
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16
|
|
; GCN-O0-NEXT: .LBB3_6: ; %Flow
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v6, 6
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v6, 7
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: s_branch .LBB3_1
|
|
; GCN-O0-NEXT: .LBB3_7: ; %Flow1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v6, 4
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v6, 5
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v6, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v6, 3
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-O0-NEXT: s_mov_b32 m0, -1
|
|
; GCN-O0-NEXT: ds_write_b32 v0, v1
|
|
; GCN-O0-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
|
|
store i32 0, ptr addrspace(1) %tmp1, align 4
|
|
%cc1 = icmp ugt i32 %tmp, 1
|
|
br i1 %cc1, label %bb.outer.then, label %bb.outer.else
|
|
|
|
bb.outer.then:
|
|
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
|
|
store i32 1, ptr addrspace(1) %tmp2, align 4
|
|
%cc2 = icmp eq i32 %tmp, 2
|
|
br i1 %cc2, label %bb.inner.then, label %bb.outer.end
|
|
|
|
bb.inner.then:
|
|
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
|
|
store i32 2, ptr addrspace(1) %tmp3, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.outer.else:
|
|
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
|
|
store i32 3, ptr addrspace(1) %tmp4, align 4
|
|
%cc3 = icmp eq i32 %tmp, 0 ; avoid being optimized away through the domination
|
|
br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
|
|
|
|
bb.inner.then2:
|
|
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
|
|
store i32 4, ptr addrspace(1) %tmp5, align 4
|
|
br label %bb.outer.end
|
|
|
|
bb.outer.end:
|
|
store i32 3, ptr addrspace(3) null
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: work-items whose workitem.id.x is greater than 1 store 0 to
; %arg[id]; every work-item then reaches bb.end, which calls s.barrier before
; returning.
; The assertions below show the exec mask being restored (s_or_b64 exec, ...)
; ahead of s_barrier in both the optimized and the -O0 output.
; NOTE(review): the name suggests the exec restore must be kept here because a
; barrier follows it - confirm against the endcf-removal pass.
define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
|
|
; GCN-LABEL: s_endpgm_unsafe_barrier:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
|
|
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GCN-NEXT: s_cbranch_execz .LBB4_2
|
|
; GCN-NEXT: ; %bb.1: ; %bb.then
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, 0
|
|
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64
|
|
; GCN-NEXT: .LBB4_2: ; %bb.end
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
|
; GCN-NEXT: s_barrier
|
|
; GCN-NEXT: s_endpgm
|
|
;
|
|
; GCN-O0-LABEL: s_endpgm_unsafe_barrier:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-O0-NEXT: s_mov_b32 s14, -1
|
|
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-O0-NEXT: v_writelane_b32 v3, s0, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v3, s1, 1
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
|
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
|
|
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v3, s0, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v3, s1, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
|
|
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v3, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v3, 1
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
|
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
|
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 2
|
|
; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
|
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s0, v3, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s1, v3, 3
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GCN-O0-NEXT: s_barrier
|
|
; GCN-O0-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = icmp ugt i32 %tmp, 1
|
|
br i1 %tmp1, label %bb.then, label %bb.end
|
|
|
|
bb.then: ; preds = %bb
|
|
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
|
|
store i32 0, ptr addrspace(1) %tmp4, align 4
|
|
br label %bb.end
|
|
|
|
bb.end: ; preds = %bb.then, %bb
|
|
; Reached from both predecessors, i.e. also by work-items that skipped bb.then.
call void @llvm.amdgcn.s.barrier()
|
|
ret void
|
|
}
|
|
|
|
; Test function with a loop whose header (bb1) branches back to itself and
; whose body contains nested conditionals (bb2 -> bb4 -> bb8) feeding
; <4 x float> phis in Flow and bb10; the value that leaves the loop is written
; by the volatile store in bb12.
; The note at the top of this file says breaking of large PHIs is disabled on
; account of this function.
; NOTE(review): the name suggests this guards SCC liveness across the exec-mask
; updates generated for the loop - confirm against the original commit.
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
|
; GCN-LABEL: scc_liveness:
|
|
; GCN: ; %bb.0: ; %bb
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_movk_i32 s4, 0x207
|
|
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
|
|
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
|
; GCN-NEXT: s_mov_b64 s[8:9], 0
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
|
; GCN-NEXT: s_branch .LBB5_3
|
|
; GCN-NEXT: .LBB5_1: ; %Flow
|
|
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
|
|
; GCN-NEXT: .LBB5_2: ; %bb10
|
|
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
|
|
; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5]
|
|
; GCN-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
|
|
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
|
; GCN-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GCN-NEXT: s_cbranch_execz .LBB5_7
|
|
; GCN-NEXT: .LBB5_3: ; %bb1
|
|
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN-NEXT: s_and_b64 s[10:11], exec, vcc
|
|
; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
|
|
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GCN-NEXT: s_cbranch_execnz .LBB5_3
|
|
; GCN-NEXT: ; %bb.4: ; %bb2
|
|
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v4, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
|
; GCN-NEXT: v_mov_b32_e32 v4, v3
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v2
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
|
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
|
|
; GCN-NEXT: s_cbranch_execz .LBB5_2
|
|
; GCN-NEXT: ; %bb.5: ; %bb4
|
|
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
|
|
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v4, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v5, v4
|
|
; GCN-NEXT: v_mov_b32_e32 v4, v3
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v2
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
|
|
; GCN-NEXT: s_and_saveexec_b64 s[12:13], s[6:7]
|
|
; GCN-NEXT: s_cbranch_execz .LBB5_1
|
|
; GCN-NEXT: ; %bb.6: ; %bb8
|
|
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
|
|
; GCN-NEXT: v_mov_b32_e32 v5, v3
|
|
; GCN-NEXT: v_mov_b32_e32 v4, v2
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN-NEXT: s_branch .LBB5_1
|
|
; GCN-NEXT: .LBB5_7: ; %bb12
|
|
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-O0-LABEL: scc_liveness:
|
|
; GCN-O0: ; %bb.0: ; %bb
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
|
|
; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: .LBB5_1: ; %bb1
|
|
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s8, v7, 2
|
|
; GCN-O0-NEXT: v_readlane_b32 s9, v7, 3
|
|
; GCN-O0-NEXT: v_readlane_b32 s6, v7, 0
|
|
; GCN-O0-NEXT: v_readlane_b32 s7, v7, 1
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 4
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 5
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4
|
|
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 6
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 7
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
|
|
; GCN-O0-NEXT: ; %bb.2: ; %bb2
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 6
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 7
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b32 s6, 0
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6
|
|
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 8
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 9
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v5
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 10
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 11
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB5_5
|
|
; GCN-O0-NEXT: ; %bb.3: ; %bb4
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
|
|
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, v5
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 12
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 13
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB5_6
|
|
; GCN-O0-NEXT: ; %bb.4: ; %bb8
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_mov_b32 s10, 0
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr9
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr8
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
|
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
|
|
; GCN-O0-NEXT: s_mov_b32 s5, s10
|
|
; GCN-O0-NEXT: s_mov_b32 s6, s9
|
|
; GCN-O0-NEXT: s_mov_b32 s7, s8
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_branch .LBB5_6
|
|
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 10
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 11
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_branch .LBB5_7
|
|
; GCN-O0-NEXT: .LBB5_6: ; %Flow
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 12
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 13
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_branch .LBB5_5
|
|
; GCN-O0-NEXT: .LBB5_7: ; %bb10
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s6, v7, 8
|
|
; GCN-O0-NEXT: v_readlane_b32 s7, v7, 9
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], -1
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 16
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 17
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GCN-O0-NEXT: s_cbranch_execz .LBB5_9
|
|
; GCN-O0-NEXT: ; %bb.8: ; %Flow1
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
|
|
; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
|
|
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s8, v7, 16
|
|
; GCN-O0-NEXT: v_readlane_b32 s9, v7, 17
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GCN-O0-NEXT: v_readlane_b32 s6, v7, 4
|
|
; GCN-O0-NEXT: v_readlane_b32 s7, v7, 5
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 14
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 15
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
|
|
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
|
|
; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s8, 0
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s9, 1
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3
|
|
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 18
|
|
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 19
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
|
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
|
|
; GCN-O0-NEXT: ; %bb.10: ; %bb12
|
|
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(4)
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 18
|
|
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 19
|
|
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GCN-O0-NEXT: ; %bb.11: ; %bb12
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(3)
|
|
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(2)
|
|
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v4, v3
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v4, v2
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
|
|
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
|
; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
|
|
; GCN-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
|
|
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
|
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
|
|
bb:
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %Flow1, %bb1, %bb
|
|
%tmp = icmp slt i32 %arg, 519
|
|
br i1 %tmp, label %bb2, label %bb1
|
|
|
|
bb2: ; preds = %bb1
|
|
; %tmp3 selects bb4 here and is reused in bb10 as the loop-exit condition.
%tmp3 = icmp eq i32 %arg, 0
|
|
br i1 %tmp3, label %bb4, label %bb10
|
|
|
|
bb4: ; preds = %bb2
|
|
; The loaded value is only used by the comparison that picks bb8 or Flow.
%tmp6 = load float, ptr addrspace(5) poison
|
|
%tmp7 = fcmp olt float %tmp6, 0.0
|
|
br i1 %tmp7, label %bb8, label %Flow
|
|
|
|
bb8: ; preds = %bb4
|
|
%tmp9 = insertelement <4 x float> poison, float 0.0, i32 1
|
|
br label %Flow
|
|
|
|
Flow: ; preds = %bb8, %bb4
|
|
%tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
|
|
br label %bb10
|
|
|
|
bb10: ; preds = %Flow, %bb2
|
|
%tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
|
|
br i1 %tmp3, label %bb12, label %Flow1
|
|
|
|
Flow1: ; preds = %bb10
|
|
br label %bb1
|
|
|
|
bb12: ; preds = %bb10
|
|
store volatile <4 x float> %tmp11, ptr addrspace(5) poison, align 16
|
|
ret void
|
|
}
|
|
|
|
; Intrinsics called by the test functions in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
|
declare void @llvm.amdgcn.s.barrier() #1
|
|
|
|
; Attribute groups referenced through the #N suffixes on the definitions and
; declarations above.
; NOTE(review): no use of #2 is visible in this part of the file - confirm it
; is referenced elsewhere before removing it.
attributes #0 = { nounwind readnone speculatable }
|
|
attributes #1 = { nounwind convergent }
|
|
attributes #2 = { nounwind }
|
|
|