The mov64 pseudo is split into two 32-bit movs, but those 32-bit movs still had the full 64-bit register implicitly defined. This affects VOPD formation, so we can now emit more VOPD instructions.
584 lines
30 KiB
LLVM
584 lines
30 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s
|
|
|
|
; Register-pressure test with the VGPR budget capped at 6 through the
; "amdgpu-num-vgpr" attribute. Five volatile i32 loads are live across an
; inline asm that clobbers v0-v4. The expected output keeps the first loaded
; value in v5 and spills the other four to scratch (offsets 0, 4, 8, 12),
; reloading each one with th:TH_LOAD_LU right before its store.
define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
|
|
; CHECK-LABEL: max_6_vgprs:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; CHECK-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
|
|
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
|
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
|
|
; CHECK-NEXT: global_load_b32 v5, v[0:1], off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: ;;#ASMSTART
|
|
; CHECK-NEXT: ;;#ASMEND
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: s_endpgm
|
|
; %tid is read with a volatile load from a poison address and used as the
; element index into %p; %p2..%p5 are chained GEPs off %p1.
%tid = load volatile i32, ptr addrspace(1) poison
|
|
%p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
|
|
%p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
|
|
%p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
|
|
%p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
|
|
%p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
|
|
; Five values that stay live until the stores after the inline asm.
%v1 = load volatile i32, ptr addrspace(1) %p1
|
|
%v2 = load volatile i32, ptr addrspace(1) %p2
|
|
%v3 = load volatile i32, ptr addrspace(1) %p3
|
|
%v4 = load volatile i32, ptr addrspace(1) %p4
|
|
%v5 = load volatile i32, ptr addrspace(1) %p5
|
|
; Empty inline asm that declares v0-v4 clobbered, so the values above cannot
; be kept in those registers across it.
call void asm sideeffect "", "~{v[0:4]}" ()
|
|
store volatile i32 %v1, ptr addrspace(1) poison
|
|
store volatile i32 %v2, ptr addrspace(1) poison
|
|
store volatile i32 %v3, ptr addrspace(1) poison
|
|
store volatile i32 %v4, ptr addrspace(1) poison
|
|
store volatile i32 %v5, ptr addrspace(1) poison
|
|
ret void
|
|
}
|
|
|
|
; Register-pressure test across control flow, with the VGPR budget capped at
; 11 through the "amdgpu-num-vgpr" attribute. The entry block loads
; %v7..%v10, then branches on %tmp != 0. Each arm loads six more values,
; runs an inline asm that clobbers v0-v9, and stores its six values plus
; %v7 and %v8; the exit block stores %v9 and %v10.
; In the expected output the entry block spills its four values to scratch
; offsets 8, 12, 0 and 4. Each arm keeps its first load in v10, spills the
; other five to offsets 16..32, and reloads offsets 8 and 12. The exit block
; reloads offsets 0 and 4. All of these reloads use th:TH_LOAD_LU.
define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" {
|
|
; CHECK-LABEL: max_11_vgprs_branch:
|
|
; CHECK: ; %bb.0: ; %.entry
|
|
; CHECK-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: s_mov_b32 s0, exec_lo
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4]
|
|
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
|
|
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
|
|
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2
|
|
; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
|
|
; CHECK-NEXT: s_cbranch_execz .LBB1_2
|
|
; CHECK-NEXT: ; %bb.1: ; %.false
|
|
; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: ;;#ASMSTART
|
|
; CHECK-NEXT: ;;#ASMEND
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: ; implicit-def: $vgpr0
|
|
; CHECK-NEXT: ; kill: killed $vgpr0
|
|
; CHECK-NEXT: ; implicit-def: $vgpr0
|
|
; CHECK-NEXT: ; kill: killed $vgpr0
|
|
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; CHECK-NEXT: .LBB1_2: ; %Flow
|
|
; CHECK-NEXT: s_and_not1_saveexec_b32 s0, s0
|
|
; CHECK-NEXT: s_cbranch_execz .LBB1_4
|
|
; CHECK-NEXT: ; %bb.3: ; %.true
|
|
; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: ;;#ASMSTART
|
|
; CHECK-NEXT: ;;#ASMEND
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: .LBB1_4: ; %.exit
|
|
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
|
|
; CHECK-NEXT: s_wait_storecnt 0x0
|
|
; CHECK-NEXT: s_endpgm
|
|
.entry:
|
|
; %tid is read with a volatile load from a poison address and used as the
; element index into %p; %p2..%p10 are chained GEPs off %p1.
%tid = load volatile i32, ptr addrspace(1) poison
|
|
%p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
|
|
%p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
|
|
%p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
|
|
%p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
|
|
%p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
|
|
%p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20
|
|
%p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24
|
|
%p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28
|
|
%p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32
|
|
%p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36
|
|
; Values defined before the branch: %v7/%v8 are stored in both arms,
; %v9/%v10 only in %.exit.
%v7 = load volatile i32, ptr addrspace(1) %p7
|
|
%v8 = load volatile i32, ptr addrspace(1) %p8
|
|
%v9 = load volatile i32, ptr addrspace(1) %p9
|
|
%v10 = load volatile i32, ptr addrspace(1) %p10
|
|
%cmp = icmp ne i32 %tmp, 0
|
|
br i1 %cmp, label %.true, label %.false
|
|
|
|
.true:
|
|
%v1_t = load volatile i32, ptr addrspace(1) %p1
|
|
%v2_t = load volatile i32, ptr addrspace(1) %p2
|
|
%v3_t = load volatile i32, ptr addrspace(1) %p3
|
|
%v4_t = load volatile i32, ptr addrspace(1) %p4
|
|
%v5_t = load volatile i32, ptr addrspace(1) %p5
|
|
%v6_t = load volatile i32, ptr addrspace(1) %p6
|
|
; Empty inline asm that declares v0-v9 clobbered.
call void asm sideeffect "", "~{v[0:9]}" ()
|
|
store volatile i32 %v1_t, ptr addrspace(1) poison
|
|
store volatile i32 %v2_t, ptr addrspace(1) poison
|
|
store volatile i32 %v3_t, ptr addrspace(1) poison
|
|
store volatile i32 %v4_t, ptr addrspace(1) poison
|
|
store volatile i32 %v5_t, ptr addrspace(1) poison
|
|
store volatile i32 %v6_t, ptr addrspace(1) poison
|
|
store volatile i32 %v7, ptr addrspace(1) poison
|
|
store volatile i32 %v8, ptr addrspace(1) poison
|
|
|
|
br label %.exit
|
|
|
|
.false:
|
|
%v1_f = load volatile i32, ptr addrspace(1) %p1
|
|
%v2_f = load volatile i32, ptr addrspace(1) %p2
|
|
%v3_f = load volatile i32, ptr addrspace(1) %p3
|
|
%v4_f = load volatile i32, ptr addrspace(1) %p4
|
|
%v5_f = load volatile i32, ptr addrspace(1) %p5
|
|
%v6_f = load volatile i32, ptr addrspace(1) %p6
|
|
; Empty inline asm that declares v0-v9 clobbered.
call void asm sideeffect "", "~{v[0:9]}" ()
|
|
store volatile i32 %v1_f, ptr addrspace(1) poison
|
|
store volatile i32 %v2_f, ptr addrspace(1) poison
|
|
store volatile i32 %v3_f, ptr addrspace(1) poison
|
|
store volatile i32 %v4_f, ptr addrspace(1) poison
|
|
store volatile i32 %v5_f, ptr addrspace(1) poison
|
|
store volatile i32 %v6_f, ptr addrspace(1) poison
|
|
store volatile i32 %v7, ptr addrspace(1) poison
|
|
store volatile i32 %v8, ptr addrspace(1) poison
|
|
|
|
br label %.exit
|
|
|
|
.exit:
|
|
store volatile i32 %v9, ptr addrspace(1) poison
|
|
store volatile i32 %v10, ptr addrspace(1) poison
|
|
ret void
|
|
}
|
|
|
|
|
|
; External function with no parameters; @baz calls it and ignores the result.
declare i32 @foo() nounwind
|
|
; External function taking a <32 x i64> argument; @baz returns its
; <8 x half> result.
declare <8 x half> @bar(<32 x i64>) nounwind
|
|
|
|
; Loads a <32 x i64> vector from the null global pointer, calls @foo, then
; passes the vector to @bar and returns @bar's result. The vector is
; therefore live across the call to @foo.
; The expected output shows the vector held partly in VGPRs (v56-v63,
; v72-v79, v88-v91, v104-v111) and partly in scratch slots around that call,
; then moved into the outgoing argument registers and stack slots for @bar.
; The constant materialization and register copies appear largely as
; v_dual_mov_b32 pairs.
define <8 x half> @baz() nounwind {
|
|
; CHECK-LABEL: baz:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; CHECK-NEXT: s_wait_expcnt 0x0
|
|
; CHECK-NEXT: s_wait_samplecnt 0x0
|
|
; CHECK-NEXT: s_wait_bvhcnt 0x0
|
|
; CHECK-NEXT: s_wait_kmcnt 0x0
|
|
; CHECK-NEXT: s_mov_b32 s0, s33
|
|
; CHECK-NEXT: s_mov_b32 s33, s32
|
|
; CHECK-NEXT: s_or_saveexec_b32 s1, -1
|
|
; CHECK-NEXT: scratch_store_b32 off, v93, s33 offset:404 ; 4-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_mov_b32 exec_lo, s1
|
|
; CHECK-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
|
; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:144
|
|
; CHECK-NEXT: scratch_store_b32 off, v41, s33 offset:140
|
|
; CHECK-NEXT: scratch_store_b32 off, v42, s33 offset:136
|
|
; CHECK-NEXT: scratch_store_b32 off, v43, s33 offset:132
|
|
; CHECK-NEXT: scratch_store_b32 off, v44, s33 offset:128
|
|
; CHECK-NEXT: scratch_store_b32 off, v45, s33 offset:124
|
|
; CHECK-NEXT: scratch_store_b32 off, v46, s33 offset:120
|
|
; CHECK-NEXT: scratch_store_b32 off, v47, s33 offset:116
|
|
; CHECK-NEXT: scratch_store_b32 off, v56, s33 offset:112
|
|
; CHECK-NEXT: scratch_store_b32 off, v57, s33 offset:108
|
|
; CHECK-NEXT: scratch_store_b32 off, v58, s33 offset:104
|
|
; CHECK-NEXT: scratch_store_b32 off, v59, s33 offset:100
|
|
; CHECK-NEXT: scratch_store_b32 off, v60, s33 offset:96
|
|
; CHECK-NEXT: scratch_store_b32 off, v61, s33 offset:92
|
|
; CHECK-NEXT: scratch_store_b32 off, v62, s33 offset:88
|
|
; CHECK-NEXT: scratch_store_b32 off, v63, s33 offset:84
|
|
; CHECK-NEXT: scratch_store_b32 off, v72, s33 offset:80
|
|
; CHECK-NEXT: scratch_store_b32 off, v73, s33 offset:76
|
|
; CHECK-NEXT: scratch_store_b32 off, v74, s33 offset:72
|
|
; CHECK-NEXT: scratch_store_b32 off, v75, s33 offset:68
|
|
; CHECK-NEXT: scratch_store_b32 off, v76, s33 offset:64
|
|
; CHECK-NEXT: scratch_store_b32 off, v77, s33 offset:60
|
|
; CHECK-NEXT: scratch_store_b32 off, v78, s33 offset:56
|
|
; CHECK-NEXT: scratch_store_b32 off, v79, s33 offset:52
|
|
; CHECK-NEXT: scratch_store_b32 off, v88, s33 offset:48
|
|
; CHECK-NEXT: scratch_store_b32 off, v89, s33 offset:44
|
|
; CHECK-NEXT: scratch_store_b32 off, v90, s33 offset:40
|
|
; CHECK-NEXT: scratch_store_b32 off, v91, s33 offset:36
|
|
; CHECK-NEXT: scratch_store_b32 off, v92, s33 offset:32
|
|
; CHECK-NEXT: scratch_store_b32 off, v104, s33 offset:28
|
|
; CHECK-NEXT: scratch_store_b32 off, v105, s33 offset:24
|
|
; CHECK-NEXT: scratch_store_b32 off, v106, s33 offset:20
|
|
; CHECK-NEXT: s_clause 0x4 ; 20-byte Folded Spill
|
|
; CHECK-NEXT: scratch_store_b32 off, v107, s33 offset:16
|
|
; CHECK-NEXT: scratch_store_b32 off, v108, s33 offset:12
|
|
; CHECK-NEXT: scratch_store_b32 off, v109, s33 offset:8
|
|
; CHECK-NEXT: scratch_store_b32 off, v110, s33 offset:4
|
|
; CHECK-NEXT: scratch_store_b32 off, v111, s33
|
|
; CHECK-NEXT: v_dual_mov_b32 v92, v31 :: v_dual_mov_b32 v1, 0
|
|
; CHECK-NEXT: v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v3, 0
|
|
; CHECK-NEXT: v_dual_mov_b32 v2, 0x50 :: v_dual_mov_b32 v5, 0
|
|
; CHECK-NEXT: v_dual_mov_b32 v4, 64 :: v_dual_mov_b32 v7, 0
|
|
; CHECK-NEXT: v_mov_b32_e32 v6, 48
|
|
; CHECK-NEXT: s_clause 0x1
|
|
; CHECK-NEXT: global_load_b128 v[56:59], v[0:1], off
|
|
; CHECK-NEXT: global_load_b128 v[104:107], v[2:3], off
|
|
; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
|
|
; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x70
|
|
; CHECK-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0x80
|
|
; CHECK-NEXT: s_clause 0x1
|
|
; CHECK-NEXT: global_load_b128 v[108:111], v[4:5], off
|
|
; CHECK-NEXT: global_load_b128 v[60:63], v[6:7], off
|
|
; CHECK-NEXT: v_mov_b32_e32 v5, 0
|
|
; CHECK-NEXT: s_clause 0x4
|
|
; CHECK-NEXT: global_load_b128 v[72:75], v[0:1], off
|
|
; CHECK-NEXT: global_load_b128 v[10:13], v[2:3], off
|
|
; CHECK-NEXT: global_load_b128 v[14:17], v[8:9], off
|
|
; CHECK-NEXT: global_load_b128 v[18:21], v[8:9], off offset:16
|
|
; CHECK-NEXT: global_load_b128 v[22:25], v[8:9], off offset:32
|
|
; CHECK-NEXT: v_dual_mov_b32 v4, 32 :: v_dual_mov_b32 v7, 0
|
|
; CHECK-NEXT: v_mov_b32_e32 v6, 16
|
|
; CHECK-NEXT: s_clause 0x1
|
|
; CHECK-NEXT: global_load_b128 v[76:79], v[4:5], off
|
|
; CHECK-NEXT: global_load_b128 v[88:91], v[6:7], off
|
|
; CHECK-NEXT: v_writelane_b32 v93, s0, 14
|
|
; CHECK-NEXT: s_addk_co_i32 s32, 0x1a0
|
|
; CHECK-NEXT: s_getpc_b64 s[0:1]
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_sext_i32_i16 s1, s1
|
|
; CHECK-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
|
|
; CHECK-NEXT: s_wait_loadcnt 0x5
|
|
; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:148 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x4
|
|
; CHECK-NEXT: scratch_store_b128 off, v[14:17], s33 offset:164 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x3
|
|
; CHECK-NEXT: scratch_store_b128 off, v[18:21], s33 offset:180 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x2
|
|
; CHECK-NEXT: s_clause 0x4 ; 80-byte Folded Spill
|
|
; CHECK-NEXT: scratch_store_b128 off, v[22:25], s33 offset:196
|
|
; CHECK-NEXT: scratch_store_b128 off, v[26:29], s33 offset:212
|
|
; CHECK-NEXT: scratch_store_b128 off, v[30:33], s33 offset:228
|
|
; CHECK-NEXT: scratch_store_b128 off, v[34:37], s33 offset:244
|
|
; CHECK-NEXT: scratch_store_b128 off, v[38:41], s33 offset:260
|
|
; CHECK-NEXT: s_clause 0x4
|
|
; CHECK-NEXT: global_load_b128 v[10:13], v[8:9], off offset:48
|
|
; CHECK-NEXT: global_load_b128 v[14:17], v[8:9], off offset:64
|
|
; CHECK-NEXT: global_load_b128 v[18:21], v[8:9], off offset:80
|
|
; CHECK-NEXT: global_load_b128 v[22:25], v[8:9], off offset:96
|
|
; CHECK-NEXT: global_load_b128 v[26:29], v[8:9], off offset:112
|
|
; CHECK-NEXT: v_writelane_b32 v93, s30, 0
|
|
; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
|
; CHECK-NEXT: s_wait_loadcnt 0x4
|
|
; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:276 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x3
|
|
; CHECK-NEXT: scratch_store_b128 off, v[14:17], s33 offset:292 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x2
|
|
; CHECK-NEXT: scratch_store_b128 off, v[18:21], s33 offset:308 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x1
|
|
; CHECK-NEXT: scratch_store_b128 off, v[22:25], s33 offset:324 ; 16-byte Folded Spill
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: s_clause 0x3 ; 64-byte Folded Spill
|
|
; CHECK-NEXT: scratch_store_b128 off, v[26:29], s33 offset:340
|
|
; CHECK-NEXT: scratch_store_b128 off, v[30:33], s33 offset:356
|
|
; CHECK-NEXT: scratch_store_b128 off, v[34:37], s33 offset:372
|
|
; CHECK-NEXT: scratch_store_b128 off, v[38:41], s33 offset:388
|
|
; CHECK-NEXT: v_writelane_b32 v93, s31, 1
|
|
; CHECK-NEXT: v_writelane_b32 v93, s34, 2
|
|
; CHECK-NEXT: v_writelane_b32 v93, s35, 3
|
|
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
|
|
; CHECK-NEXT: v_writelane_b32 v93, s36, 4
|
|
; CHECK-NEXT: v_writelane_b32 v93, s37, 5
|
|
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
|
|
; CHECK-NEXT: v_writelane_b32 v93, s38, 6
|
|
; CHECK-NEXT: v_writelane_b32 v93, s39, 7
|
|
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
|
|
; CHECK-NEXT: v_writelane_b32 v93, s48, 8
|
|
; CHECK-NEXT: v_writelane_b32 v93, s49, 9
|
|
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
|
|
; CHECK-NEXT: v_writelane_b32 v93, s50, 10
|
|
; CHECK-NEXT: s_mov_b32 s50, s15
|
|
; CHECK-NEXT: v_writelane_b32 v93, s51, 11
|
|
; CHECK-NEXT: s_mov_b32 s51, s14
|
|
; CHECK-NEXT: v_writelane_b32 v93, s52, 12
|
|
; CHECK-NEXT: s_mov_b32 s52, s13
|
|
; CHECK-NEXT: v_writelane_b32 v93, s53, 13
|
|
; CHECK-NEXT: s_mov_b32 s53, s12
|
|
; CHECK-NEXT: s_wait_kmcnt 0x0
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; CHECK-NEXT: s_clause 0x7 ; 128-byte Folded Reload
|
|
; CHECK-NEXT: scratch_load_b128 v[0:3], off, s33 offset:276 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:292 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[8:11], off, s33 offset:308 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[12:15], off, s33 offset:324 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[16:19], off, s33 offset:340 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[20:23], off, s33 offset:356 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[24:27], off, s33 offset:372 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[28:31], off, s33 offset:388 th:TH_LOAD_LU
|
|
; CHECK-NEXT: s_getpc_b64 s[0:1]
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_sext_i32_i16 s1, s1
|
|
; CHECK-NEXT: s_add_co_u32 s0, s0, bar@gotpcrel32@lo+12
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_add_co_ci_u32 s1, s1, bar@gotpcrel32@hi+24
|
|
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
|
|
; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
|
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
|
|
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
|
|
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
|
|
; CHECK-NEXT: s_mov_b32 s12, s53
|
|
; CHECK-NEXT: s_mov_b32 s13, s52
|
|
; CHECK-NEXT: s_mov_b32 s14, s51
|
|
; CHECK-NEXT: s_mov_b32 s15, s50
|
|
; CHECK-NEXT: s_wait_loadcnt 0x3
|
|
; CHECK-NEXT: scratch_store_b32 off, v19, s32 offset:128
|
|
; CHECK-NEXT: s_wait_loadcnt 0x2
|
|
; CHECK-NEXT: scratch_load_b128 v[19:22], off, s33 offset:148 th:TH_LOAD_LU ; 16-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x2
|
|
; CHECK-NEXT: scratch_load_b128 v[23:26], off, s33 offset:164 th:TH_LOAD_LU ; 16-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_loadcnt 0x2
|
|
; CHECK-NEXT: s_clause 0x5 ; 96-byte Folded Reload
|
|
; CHECK-NEXT: scratch_load_b128 v[27:30], off, s33 offset:180 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[31:34], off, s33 offset:196 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[35:38], off, s33 offset:212 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[39:42], off, s33 offset:228 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[43:46], off, s33 offset:244 th:TH_LOAD_LU
|
|
; CHECK-NEXT: scratch_load_b128 v[47:50], off, s33 offset:260 th:TH_LOAD_LU
|
|
; CHECK-NEXT: s_clause 0x3
|
|
; CHECK-NEXT: scratch_store_b128 off, v[15:18], s32 offset:112
|
|
; CHECK-NEXT: scratch_store_b128 off, v[11:14], s32 offset:96
|
|
; CHECK-NEXT: scratch_store_b128 off, v[7:10], s32 offset:80
|
|
; CHECK-NEXT: scratch_store_b128 off, v[3:6], s32 offset:64
|
|
; CHECK-NEXT: v_dual_mov_b32 v4, v88 :: v_dual_mov_b32 v5, v89
|
|
; CHECK-NEXT: v_dual_mov_b32 v6, v90 :: v_dual_mov_b32 v7, v91
|
|
; CHECK-NEXT: v_dual_mov_b32 v8, v76 :: v_dual_mov_b32 v9, v77
|
|
; CHECK-NEXT: v_dual_mov_b32 v10, v78 :: v_dual_mov_b32 v11, v79
|
|
; CHECK-NEXT: v_dual_mov_b32 v12, v60 :: v_dual_mov_b32 v13, v61
|
|
; CHECK-NEXT: v_dual_mov_b32 v14, v62 :: v_dual_mov_b32 v15, v63
|
|
; CHECK-NEXT: v_dual_mov_b32 v16, v108 :: v_dual_mov_b32 v17, v109
|
|
; CHECK-NEXT: v_mov_b32_e32 v18, v110
|
|
; CHECK-NEXT: s_wait_loadcnt 0x1
|
|
; CHECK-NEXT: v_dual_mov_b32 v44, v0 :: v_dual_mov_b32 v45, v1
|
|
; CHECK-NEXT: v_mov_b32_e32 v46, v2
|
|
; CHECK-NEXT: v_dual_mov_b32 v0, v72 :: v_dual_mov_b32 v1, v73
|
|
; CHECK-NEXT: v_dual_mov_b32 v2, v74 :: v_dual_mov_b32 v3, v75
|
|
; CHECK-NEXT: v_mov_b32_e32 v43, v34
|
|
; CHECK-NEXT: v_dual_mov_b32 v42, v33 :: v_dual_mov_b32 v41, v32
|
|
; CHECK-NEXT: v_dual_mov_b32 v40, v31 :: v_dual_mov_b32 v39, v30
|
|
; CHECK-NEXT: v_dual_mov_b32 v38, v29 :: v_dual_mov_b32 v37, v28
|
|
; CHECK-NEXT: v_dual_mov_b32 v36, v27 :: v_dual_mov_b32 v35, v26
|
|
; CHECK-NEXT: v_mov_b32_e32 v34, v25
|
|
; CHECK-NEXT: v_mov_b32_e32 v33, v24
|
|
; CHECK-NEXT: v_mov_b32_e32 v32, v23
|
|
; CHECK-NEXT: v_mov_b32_e32 v31, v22
|
|
; CHECK-NEXT: v_mov_b32_e32 v30, v21
|
|
; CHECK-NEXT: v_mov_b32_e32 v29, v20
|
|
; CHECK-NEXT: v_mov_b32_e32 v28, v19
|
|
; CHECK-NEXT: s_clause 0x3
|
|
; CHECK-NEXT: scratch_store_b128 off, v[43:46], s32 offset:48
|
|
; CHECK-NEXT: scratch_store_b128 off, v[39:42], s32 offset:32
|
|
; CHECK-NEXT: scratch_store_b128 off, v[35:38], s32 offset:16
|
|
; CHECK-NEXT: scratch_store_b128 off, v[31:34], s32
|
|
; CHECK-NEXT: v_mov_b32_e32 v31, v92
|
|
; CHECK-NEXT: v_dual_mov_b32 v19, v111 :: v_dual_mov_b32 v20, v104
|
|
; CHECK-NEXT: v_dual_mov_b32 v21, v105 :: v_dual_mov_b32 v22, v106
|
|
; CHECK-NEXT: v_dual_mov_b32 v23, v107 :: v_dual_mov_b32 v24, v56
|
|
; CHECK-NEXT: v_dual_mov_b32 v25, v57 :: v_dual_mov_b32 v26, v58
|
|
; CHECK-NEXT: v_mov_b32_e32 v27, v59
|
|
; CHECK-NEXT: s_wait_kmcnt 0x0
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
|
; CHECK-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
|
; CHECK-NEXT: scratch_load_b32 v111, off, s33
|
|
; CHECK-NEXT: scratch_load_b32 v110, off, s33 offset:4
|
|
; CHECK-NEXT: scratch_load_b32 v109, off, s33 offset:8
|
|
; CHECK-NEXT: scratch_load_b32 v108, off, s33 offset:12
|
|
; CHECK-NEXT: scratch_load_b32 v107, off, s33 offset:16
|
|
; CHECK-NEXT: scratch_load_b32 v106, off, s33 offset:20
|
|
; CHECK-NEXT: scratch_load_b32 v105, off, s33 offset:24
|
|
; CHECK-NEXT: scratch_load_b32 v104, off, s33 offset:28
|
|
; CHECK-NEXT: scratch_load_b32 v92, off, s33 offset:32
|
|
; CHECK-NEXT: scratch_load_b32 v91, off, s33 offset:36
|
|
; CHECK-NEXT: scratch_load_b32 v90, off, s33 offset:40
|
|
; CHECK-NEXT: scratch_load_b32 v89, off, s33 offset:44
|
|
; CHECK-NEXT: scratch_load_b32 v88, off, s33 offset:48
|
|
; CHECK-NEXT: scratch_load_b32 v79, off, s33 offset:52
|
|
; CHECK-NEXT: scratch_load_b32 v78, off, s33 offset:56
|
|
; CHECK-NEXT: scratch_load_b32 v77, off, s33 offset:60
|
|
; CHECK-NEXT: scratch_load_b32 v76, off, s33 offset:64
|
|
; CHECK-NEXT: scratch_load_b32 v75, off, s33 offset:68
|
|
; CHECK-NEXT: scratch_load_b32 v74, off, s33 offset:72
|
|
; CHECK-NEXT: scratch_load_b32 v73, off, s33 offset:76
|
|
; CHECK-NEXT: scratch_load_b32 v72, off, s33 offset:80
|
|
; CHECK-NEXT: scratch_load_b32 v63, off, s33 offset:84
|
|
; CHECK-NEXT: scratch_load_b32 v62, off, s33 offset:88
|
|
; CHECK-NEXT: scratch_load_b32 v61, off, s33 offset:92
|
|
; CHECK-NEXT: scratch_load_b32 v60, off, s33 offset:96
|
|
; CHECK-NEXT: scratch_load_b32 v59, off, s33 offset:100
|
|
; CHECK-NEXT: scratch_load_b32 v58, off, s33 offset:104
|
|
; CHECK-NEXT: scratch_load_b32 v57, off, s33 offset:108
|
|
; CHECK-NEXT: scratch_load_b32 v56, off, s33 offset:112
|
|
; CHECK-NEXT: scratch_load_b32 v47, off, s33 offset:116
|
|
; CHECK-NEXT: scratch_load_b32 v46, off, s33 offset:120
|
|
; CHECK-NEXT: scratch_load_b32 v45, off, s33 offset:124
|
|
; CHECK-NEXT: s_clause 0x4 ; 20-byte Folded Reload
|
|
; CHECK-NEXT: scratch_load_b32 v44, off, s33 offset:128
|
|
; CHECK-NEXT: scratch_load_b32 v43, off, s33 offset:132
|
|
; CHECK-NEXT: scratch_load_b32 v42, off, s33 offset:136
|
|
; CHECK-NEXT: scratch_load_b32 v41, off, s33 offset:140
|
|
; CHECK-NEXT: scratch_load_b32 v40, off, s33 offset:144
|
|
; CHECK-NEXT: v_readlane_b32 s53, v93, 13
|
|
; CHECK-NEXT: v_readlane_b32 s52, v93, 12
|
|
; CHECK-NEXT: v_readlane_b32 s51, v93, 11
|
|
; CHECK-NEXT: v_readlane_b32 s50, v93, 10
|
|
; CHECK-NEXT: v_readlane_b32 s49, v93, 9
|
|
; CHECK-NEXT: v_readlane_b32 s48, v93, 8
|
|
; CHECK-NEXT: v_readlane_b32 s39, v93, 7
|
|
; CHECK-NEXT: v_readlane_b32 s38, v93, 6
|
|
; CHECK-NEXT: v_readlane_b32 s37, v93, 5
|
|
; CHECK-NEXT: v_readlane_b32 s36, v93, 4
|
|
; CHECK-NEXT: v_readlane_b32 s35, v93, 3
|
|
; CHECK-NEXT: v_readlane_b32 s34, v93, 2
|
|
; CHECK-NEXT: v_readlane_b32 s31, v93, 1
|
|
; CHECK-NEXT: v_readlane_b32 s30, v93, 0
|
|
; CHECK-NEXT: s_mov_b32 s32, s33
|
|
; CHECK-NEXT: v_readlane_b32 s0, v93, 14
|
|
; CHECK-NEXT: s_or_saveexec_b32 s1, -1
|
|
; CHECK-NEXT: scratch_load_b32 v93, off, s33 offset:404 ; 4-byte Folded Reload
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_mov_b32 exec_lo, s1
|
|
; CHECK-NEXT: s_mov_b32 s33, s0
|
|
; CHECK-NEXT: s_wait_loadcnt 0x0
|
|
; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; 256 bytes of vector data loaded from the null global pointer.
%A = load <32 x i64>, ptr addrspace(1) null, align 256
|
|
; %B is never used; %A is defined before this call and used after it.
%B = call i32 @foo()
|
|
%C = call <8 x half> @bar(<32 x i64> %A)
|
|
ret <8 x half> %C
|
|
}
|