Fix the ABI on old subtargets to match new subtargets, packing 16-bit element subvectors into 32-bit registers. Previously this would be scalarized and promoted to i32/float. Note this only changes the vector cases. Scalar i16/half are still promoted to i32/float for now. I've unsuccessfully tried to make that switch in the past, so leave that for later. This will help with removal of softPromoteHalfType.
1093 lines
47 KiB
LLVM
1093 lines
47 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
|
|
|
; FIXME: Why is this commuted only sometimes?
|
|
; Leaf fastcc function that returns %arg0 + %arg1. Several functions later in
; this file use it as their call target. The checks expect a single add and a
; return through s_setpc_b64 s[30:31] on every tested subtarget.
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
|
|
; FIJI-LABEL: i32_fastcc_i32_i32:
|
|
; FIJI: ; %bb.0:
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
|
|
; FIJI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; HAWAII-LABEL: i32_fastcc_i32_i32:
|
|
; HAWAII: ; %bb.0:
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
|
; HAWAII-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: i32_fastcc_i32_i32:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; Returns %arg0 + %arg1 like @i32_fastcc_i32_i32, but additionally performs a
; volatile store of 9 to element 5 of a [16 x i32] addrspace(5) alloca so the
; function owns a stack object. The checks expect that store at s32 offset:20.
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
|
|
; FIJI-LABEL: i32_fastcc_i32_i32_stack_object:
|
|
; FIJI: ; %bb.0:
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FIJI-NEXT: v_mov_b32_e32 v2, 9
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
|
|
; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0)
|
|
; FIJI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; HAWAII-LABEL: i32_fastcc_i32_i32_stack_object:
|
|
; HAWAII: ; %bb.0:
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; HAWAII-NEXT: v_mov_b32_e32 v2, 9
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
|
; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0)
|
|
; HAWAII-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: i32_fastcc_i32_i32_stack_object:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, 9
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; Basic sibling call: forwards %a and %b to @i32_fastcc_i32_i32 in tail
; position (%c is unused). The checks expect the callee address to be loaded
; through the GOT and reached with s_setpc_b64, with no s_swappc_b64 emitted.
define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Sibling call where the caller owns a stack object: a volatile store of 9 to
; element 5 of a [16 x i32] addrspace(5) alloca precedes the tail call to
; @i32_fastcc_i32_i32. The checks expect the store at s32 offset:20 followed by
; a jump with s_setpc_b64.
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 9
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Sibling call where both sides own a stack object: the caller does a volatile
; store into its own alloca and then tail-calls
; @i32_fastcc_i32_i32_stack_object. The checks expect the caller's store at
; s32 offset:20 followed by a jump with s_setpc_b64.
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 9
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Sibling call whose i32 result is discarded: the caller returns void after
; the tail call to @i32_fastcc_i32_i32. The checks still expect a jump with
; s_setpc_b64 rather than a call and return.
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; It doesn't make sense to do a tail call from a kernel
|
|
; An amdgpu_kernel entry point that marks its call to @i32_fastcc_i32_i32 as
; `tail`. The checks expect an ordinary call (s_swappc_b64) followed by
; s_endpgm on every subtarget, with the kernel arguments copied into v0/v1.
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
; FIJI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
|
|
; FIJI: ; %bb.0: ; %entry
|
|
; FIJI-NEXT: s_add_i32 s6, s6, s9
|
|
; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
|
|
; FIJI-NEXT: s_add_u32 s0, s0, s9
|
|
; FIJI-NEXT: s_addc_u32 s1, s1, 0
|
|
; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s7
|
|
; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; FIJI-NEXT: s_getpc_b64 s[6:7]
|
|
; FIJI-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; FIJI-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; FIJI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
|
; FIJI-NEXT: s_mov_b32 s32, 0
|
|
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; FIJI-NEXT: v_mov_b32_e32 v0, s4
|
|
; FIJI-NEXT: v_mov_b32_e32 v1, s5
|
|
; FIJI-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
|
; FIJI-NEXT: s_endpgm
|
|
;
|
|
; HAWAII-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
|
|
; HAWAII: ; %bb.0: ; %entry
|
|
; HAWAII-NEXT: s_add_i32 s6, s6, s9
|
|
; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
|
|
; HAWAII-NEXT: s_add_u32 s0, s0, s9
|
|
; HAWAII-NEXT: s_addc_u32 s1, s1, 0
|
|
; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s7
|
|
; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; HAWAII-NEXT: s_getpc_b64 s[6:7]
|
|
; HAWAII-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; HAWAII-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; HAWAII-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
|
; HAWAII-NEXT: s_mov_b32 s32, 0
|
|
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
|
|
; HAWAII-NEXT: v_mov_b32_e32 v0, s4
|
|
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
|
|
; HAWAII-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
|
; HAWAII-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
|
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
|
; GFX9-NEXT: s_add_u32 s0, s0, s9
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
|
; GFX9-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GFX9-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
|
; GFX9-NEXT: s_mov_b32 s32, 0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
|
; GFX9-NEXT: s_endpgm
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; Callee taking an i32 plus a byval(i32) addrspace(5) pointer; it loads the
; byval i32 and returns it added to %arg0. The checks expect the byval value
; to be read with a buffer_load_dword at s32 before the add.
define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 {
|
|
; FIJI-LABEL: i32_fastcc_i32_byval_i32:
|
|
; FIJI: ; %bb.0:
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], s32
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0)
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
|
|
; FIJI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; HAWAII-LABEL: i32_fastcc_i32_byval_i32:
|
|
; HAWAII: ; %bb.0:
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], s32
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0)
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
|
; HAWAII-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: i32_fastcc_i32_byval_i32:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%arg1.load = load i32, ptr addrspace(5) %arg1, align 4
|
|
%add0 = add i32 %arg0, %arg1.load
|
|
ret i32 %add0
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent.
|
|
; The caller itself receives a byval(i32) argument and forwards it to
; @i32_fastcc_i32_byval_i32. The checks expect no sibling call here: a frame is
; set up through s33, v40 is spilled and used to hold s30/s31 and the old s33,
; the byval value is copied to the outgoing slot at s32, and the callee is
; reached with s_swappc_b64 before a normal s_setpc_b64 s[30:31] return.
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s4, s33
|
|
; GCN-NEXT: s_mov_b32 s33, s32
|
|
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33
|
|
; GCN-NEXT: v_writelane_b32 v40, s4, 2
|
|
; GCN-NEXT: s_addk_i32 s32, 0x400
|
|
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
|
|
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; GCN-NEXT: v_readlane_b32 s31, v40, 1
|
|
; GCN-NEXT: v_readlane_b32 s30, v40, 0
|
|
; GCN-NEXT: s_mov_b32 s32, s33
|
|
; GCN-NEXT: v_readlane_b32 s4, v40, 2
|
|
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-NEXT: s_mov_b32 s33, s4
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent, not callee. The stack
|
|
; usage of outgoing arguments must be <= the incoming stack
|
|
; arguments.
|
|
; The caller takes an i32 plus a [32 x i32] aggregate (no byval of its own)
; and tail-calls @i32_fastcc_i32_byval_i32, passing the constant address 16
; (via inttoptr) as the byval pointer. The checks expect the i32 at absolute
; offset 16 to be loaded, stored to the outgoing slot at s32, and the callee
; reached with s_setpc_b64.
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Callee taking two i32 values plus a [32 x i32] aggregate. It returns
; %arg0 + %arg1 + %large[30] + %large[31]. The checks expect those two array
; elements to be loaded from the stack at s32 offset:4 and offset:8; on GFX9
; the last two additions are expected to fold into one v_add3_u32.
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
|
|
; FIJI-LABEL: i32_fastcc_i32_i32_a32i32:
|
|
; FIJI: ; %bb.0:
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FIJI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
|
|
; FIJI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
|
|
; FIJI-NEXT: s_waitcnt vmcnt(1)
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
|
|
; FIJI-NEXT: s_waitcnt vmcnt(0)
|
|
; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v3
|
|
; FIJI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; HAWAII-LABEL: i32_fastcc_i32_i32_a32i32:
|
|
; HAWAII: ; %bb.0:
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; HAWAII-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
|
|
; HAWAII-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(1)
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
|
; HAWAII-NEXT: s_waitcnt vmcnt(0)
|
|
; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v3
|
|
; HAWAII-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: i32_fastcc_i32_i32_a32i32:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
|
|
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: v_add3_u32 v0, v0, v3, v2
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%val_firststack = extractvalue [32 x i32] %large, 30
|
|
%val_laststack = extractvalue [32 x i32] %large, 31
|
|
%add0 = add i32 %arg0, %arg1
|
|
%add1 = add i32 %add0, %val_firststack
|
|
%add2 = add i32 %add1, %val_laststack
|
|
ret i32 %add2
|
|
}
|
|
|
|
; FIXME: Why load and store same location for stack args?
|
|
; Sibling call that forwards all of its arguments, including the [32 x i32]
; aggregate, unchanged to @i32_fastcc_i32_i32_a32i32. The checks expect the
; three dwords at s32, s32+4 and s32+8 to be loaded and then stored back to the
; same offsets before jumping with s_setpc_b64.
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Same forwarding sibling call to @i32_fastcc_i32_i32_a32i32, but the caller
; also does a volatile store of 9 to element 5 of a [16 x i32] addrspace(5)
; alloca. The checks expect that store at s32 offset:32, the three stack
; argument dwords re-stored at s32, +4 and +8, and a jump with s_setpc_b64.
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v34, 9
|
|
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; If the callee requires more stack argument space than the caller,
|
|
; don't do a tail call.
|
|
; TODO: Do we really need this restriction?
|
|
; The caller takes only two i32 values but calls @i32_fastcc_i32_i32_a32i32
; with a zeroinitializer [32 x i32]. The checks expect no sibling call: a frame
; is set up through s33, zero is stored to the three outgoing stack slots at
; s32, +4 and +8, v2..v30 are zeroed, and the callee is reached with
; s_swappc_b64 before a normal s_setpc_b64 s[30:31] return.
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
|
|
; GCN-LABEL: no_sibling_call_callee_more_stack_space:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s4, s33
|
|
; GCN-NEXT: s_mov_b32 s33, s32
|
|
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-NEXT: s_addk_i32 s32, 0x400
|
|
; GCN-NEXT: v_writelane_b32 v40, s4, 2
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v5, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v6, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v7, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v8, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v9, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v10, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v11, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v12, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v13, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v14, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v15, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v17, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v18, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v19, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v20, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v21, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v22, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v23, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v24, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v25, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v26, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v27, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v28, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v29, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v30, 0
|
|
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; GCN-NEXT: v_readlane_b32 s31, v40, 1
|
|
; GCN-NEXT: v_readlane_b32 s30, v40, 0
|
|
; GCN-NEXT: s_mov_b32 s32, s33
|
|
; GCN-NEXT: v_readlane_b32 s4, v40, 2
|
|
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-NEXT: s_mov_b32 s33, s4
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have another non-tail call in the function
|
|
; Two calls in one function: the result of the first call (to
; @i32_fastcc_i32_i32) is passed as the third argument of the second call (to
; @sibling_call_i32_fastcc_i32_i32), whose result is returned. The checks
; expect the first to be a regular s_swappc_b64 call, with %a and %b kept in
; v41/v40 across it, and the second to be a jump with s_setpc_b64 after the
; frame and spilled registers are restored.
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
|
|
; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s4, s33
|
|
; GCN-NEXT: s_mov_b32 s33, s32
|
|
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GCN-NEXT: s_addk_i32 s32, 0x400
|
|
; GCN-NEXT: v_writelane_b32 v42, s4, 2
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_writelane_b32 v42, s30, 0
|
|
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GCN-NEXT: v_writelane_b32 v42, s31, 1
|
|
; GCN-NEXT: v_mov_b32_e32 v40, v1
|
|
; GCN-NEXT: v_mov_b32_e32 v41, v0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v41
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v40
|
|
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
|
|
; GCN-NEXT: v_readlane_b32 s31, v42, 1
|
|
; GCN-NEXT: v_readlane_b32 s30, v42, 0
|
|
; GCN-NEXT: s_mov_b32 s32, s33
|
|
; GCN-NEXT: v_readlane_b32 s6, v42, 2
|
|
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-NEXT: s_mov_b32 s33, s6
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
%ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have stack object in caller and stack passed arguments. SP should be
|
|
; in same place at function exit.
|
|
; Caller with both a local stack object (volatile store of 9 into a
; [16 x i32] addrspace(5) alloca) and stack-passed arguments, forwarding
; everything to @i32_fastcc_i32_i32_a32i32. The checks expect the local store
; at s32 offset:32, the three stack argument dwords re-stored at s32, +4 and
; +8, no adjustment of s32, and a jump with s_setpc_b64.
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v34, 9
|
|
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; The caller's aggregate argument ([36 x i32]) is larger than the
; [32 x i32] zeroinitializer it passes to @i32_fastcc_i32_i32_a32i32. The
; caller also does a volatile store of 9 into a local alloca. The checks
; expect that store at s32 offset:48, zeros stored to the outgoing stack slots
; at s32, +4 and +8, v2..v30 zeroed, and a jump with s_setpc_b64.
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
|
|
; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
|
|
; GCN: ; %bb.0: ; %entry
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
|
|
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 9
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
|
|
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v5, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v6, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v7, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v8, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v9, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v10, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v11, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v12, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v13, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v14, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v15, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v17, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v18, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v19, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v20, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v21, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v22, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v23, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v24, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v25, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v26, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v27, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v28, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v29, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v30, 0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; External addrspace(4) constant holding the function pointer that the
; indirect tail-call test below loads.
@func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4

; Do support tail calls with a uniform, but unknown, callee.
;
; The callee address is loaded from @func_ptr_gv and tail-called with
; (%a, %b); %c is accepted but unused. All run lines share one set of checks:
; the address of @func_ptr_gv is formed through a gotpcrel32 relocation, two
; s_load_dwordx2 fetch first the global's address and then the function
; pointer, and control is transferred with s_setpc_b64 (a jump) rather than
; s_swappc_b64 (a call).
define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
  %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
  ret i32 %ret
}
|
|
|
|
; We can't support a tail call to a divergent target. Use a waterfall
; loop around a regular call.
;
; Here the callee pointer is the function argument %func.ptr, and the call
; passes %a and %b + %c. Per the checks below, the expected code:
;   - saves s33 and the SGPRs it uses into lanes of v40, which is itself
;     spilled to the stack at s33,
;   - loops at .LBB18_1, where v_readfirstlane_b32 picks one pointer value out
;     of v[0:1], v_cmp_eq_u64 + s_and_saveexec_b64 limit exec to the lanes
;     that hold that value, s_swappc_b64 performs a real call, and
;     s_xor_b64 / s_cbranch_execnz repeat for the remaining lanes,
;   - restores the saved registers and returns with s_setpc_b64 s[30:31].
; The three per-target check blocks differ only in the add instruction
; (v_add_u32_e32 with vcc, v_add_i32_e32 with vcc, v_add_u32_e32 without vcc).
define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
; FIJI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
; FIJI: ; %bb.0: ; %entry
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FIJI-NEXT: s_mov_b32 s16, s33
; FIJI-NEXT: s_mov_b32 s33, s32
; FIJI-NEXT: s_or_saveexec_b64 s[18:19], -1
; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; FIJI-NEXT: s_mov_b64 exec, s[18:19]
; FIJI-NEXT: v_writelane_b32 v40, s16, 18
; FIJI-NEXT: v_writelane_b32 v40, s30, 0
; FIJI-NEXT: v_writelane_b32 v40, s31, 1
; FIJI-NEXT: v_writelane_b32 v40, s34, 2
; FIJI-NEXT: v_writelane_b32 v40, s35, 3
; FIJI-NEXT: v_writelane_b32 v40, s36, 4
; FIJI-NEXT: v_writelane_b32 v40, s37, 5
; FIJI-NEXT: v_writelane_b32 v40, s38, 6
; FIJI-NEXT: v_writelane_b32 v40, s39, 7
; FIJI-NEXT: v_writelane_b32 v40, s48, 8
; FIJI-NEXT: v_writelane_b32 v40, s49, 9
; FIJI-NEXT: v_writelane_b32 v40, s50, 10
; FIJI-NEXT: v_writelane_b32 v40, s51, 11
; FIJI-NEXT: v_writelane_b32 v40, s52, 12
; FIJI-NEXT: v_writelane_b32 v40, s53, 13
; FIJI-NEXT: v_writelane_b32 v40, s54, 14
; FIJI-NEXT: v_writelane_b32 v40, s55, 15
; FIJI-NEXT: v_writelane_b32 v40, s64, 16
; FIJI-NEXT: s_mov_b32 s50, s15
; FIJI-NEXT: s_mov_b32 s51, s14
; FIJI-NEXT: s_mov_b32 s52, s13
; FIJI-NEXT: s_mov_b32 s53, s12
; FIJI-NEXT: s_mov_b64 s[34:35], s[10:11]
; FIJI-NEXT: s_mov_b64 s[36:37], s[8:9]
; FIJI-NEXT: s_mov_b64 s[38:39], s[6:7]
; FIJI-NEXT: s_mov_b64 s[48:49], s[4:5]
; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; FIJI-NEXT: s_mov_b64 s[54:55], exec
; FIJI-NEXT: s_addk_i32 s32, 0x400
; FIJI-NEXT: v_writelane_b32 v40, s65, 17
; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; FIJI-NEXT: v_readfirstlane_b32 s16, v0
; FIJI-NEXT: v_readfirstlane_b32 s17, v1
; FIJI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
; FIJI-NEXT: s_and_saveexec_b64 s[64:65], vcc
; FIJI-NEXT: s_mov_b64 s[4:5], s[48:49]
; FIJI-NEXT: s_mov_b64 s[6:7], s[38:39]
; FIJI-NEXT: s_mov_b64 s[8:9], s[36:37]
; FIJI-NEXT: s_mov_b64 s[10:11], s[34:35]
; FIJI-NEXT: s_mov_b32 s12, s53
; FIJI-NEXT: s_mov_b32 s13, s52
; FIJI-NEXT: s_mov_b32 s14, s51
; FIJI-NEXT: s_mov_b32 s15, s50
; FIJI-NEXT: v_mov_b32_e32 v0, v2
; FIJI-NEXT: v_mov_b32_e32 v1, v3
; FIJI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FIJI-NEXT: v_mov_b32_e32 v4, v0
; FIJI-NEXT: ; implicit-def: $vgpr0_vgpr1
; FIJI-NEXT: ; implicit-def: $vgpr31
; FIJI-NEXT: ; implicit-def: $vgpr2
; FIJI-NEXT: ; implicit-def: $vgpr3
; FIJI-NEXT: s_xor_b64 exec, exec, s[64:65]
; FIJI-NEXT: s_cbranch_execnz .LBB18_1
; FIJI-NEXT: ; %bb.2:
; FIJI-NEXT: s_mov_b64 exec, s[54:55]
; FIJI-NEXT: v_mov_b32_e32 v0, v4
; FIJI-NEXT: v_readlane_b32 s65, v40, 17
; FIJI-NEXT: v_readlane_b32 s64, v40, 16
; FIJI-NEXT: v_readlane_b32 s55, v40, 15
; FIJI-NEXT: v_readlane_b32 s54, v40, 14
; FIJI-NEXT: v_readlane_b32 s53, v40, 13
; FIJI-NEXT: v_readlane_b32 s52, v40, 12
; FIJI-NEXT: v_readlane_b32 s51, v40, 11
; FIJI-NEXT: v_readlane_b32 s50, v40, 10
; FIJI-NEXT: v_readlane_b32 s49, v40, 9
; FIJI-NEXT: v_readlane_b32 s48, v40, 8
; FIJI-NEXT: v_readlane_b32 s39, v40, 7
; FIJI-NEXT: v_readlane_b32 s38, v40, 6
; FIJI-NEXT: v_readlane_b32 s37, v40, 5
; FIJI-NEXT: v_readlane_b32 s36, v40, 4
; FIJI-NEXT: v_readlane_b32 s35, v40, 3
; FIJI-NEXT: v_readlane_b32 s34, v40, 2
; FIJI-NEXT: v_readlane_b32 s31, v40, 1
; FIJI-NEXT: v_readlane_b32 s30, v40, 0
; FIJI-NEXT: s_mov_b32 s32, s33
; FIJI-NEXT: v_readlane_b32 s4, v40, 18
; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1
; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; FIJI-NEXT: s_mov_b64 exec, s[6:7]
; FIJI-NEXT: s_mov_b32 s33, s4
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: s_setpc_b64 s[30:31]
;
; HAWAII-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
; HAWAII: ; %bb.0: ; %entry
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HAWAII-NEXT: s_mov_b32 s16, s33
; HAWAII-NEXT: s_mov_b32 s33, s32
; HAWAII-NEXT: s_or_saveexec_b64 s[18:19], -1
; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; HAWAII-NEXT: s_mov_b64 exec, s[18:19]
; HAWAII-NEXT: v_writelane_b32 v40, s16, 18
; HAWAII-NEXT: v_writelane_b32 v40, s30, 0
; HAWAII-NEXT: v_writelane_b32 v40, s31, 1
; HAWAII-NEXT: v_writelane_b32 v40, s34, 2
; HAWAII-NEXT: v_writelane_b32 v40, s35, 3
; HAWAII-NEXT: v_writelane_b32 v40, s36, 4
; HAWAII-NEXT: v_writelane_b32 v40, s37, 5
; HAWAII-NEXT: v_writelane_b32 v40, s38, 6
; HAWAII-NEXT: v_writelane_b32 v40, s39, 7
; HAWAII-NEXT: v_writelane_b32 v40, s48, 8
; HAWAII-NEXT: v_writelane_b32 v40, s49, 9
; HAWAII-NEXT: v_writelane_b32 v40, s50, 10
; HAWAII-NEXT: v_writelane_b32 v40, s51, 11
; HAWAII-NEXT: v_writelane_b32 v40, s52, 12
; HAWAII-NEXT: v_writelane_b32 v40, s53, 13
; HAWAII-NEXT: v_writelane_b32 v40, s54, 14
; HAWAII-NEXT: v_writelane_b32 v40, s55, 15
; HAWAII-NEXT: v_writelane_b32 v40, s64, 16
; HAWAII-NEXT: s_mov_b32 s50, s15
; HAWAII-NEXT: s_mov_b32 s51, s14
; HAWAII-NEXT: s_mov_b32 s52, s13
; HAWAII-NEXT: s_mov_b32 s53, s12
; HAWAII-NEXT: s_mov_b64 s[34:35], s[10:11]
; HAWAII-NEXT: s_mov_b64 s[36:37], s[8:9]
; HAWAII-NEXT: s_mov_b64 s[38:39], s[6:7]
; HAWAII-NEXT: s_mov_b64 s[48:49], s[4:5]
; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; HAWAII-NEXT: s_mov_b64 s[54:55], exec
; HAWAII-NEXT: s_addk_i32 s32, 0x400
; HAWAII-NEXT: v_writelane_b32 v40, s65, 17
; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; HAWAII-NEXT: v_readfirstlane_b32 s16, v0
; HAWAII-NEXT: v_readfirstlane_b32 s17, v1
; HAWAII-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
; HAWAII-NEXT: s_and_saveexec_b64 s[64:65], vcc
; HAWAII-NEXT: s_mov_b64 s[4:5], s[48:49]
; HAWAII-NEXT: s_mov_b64 s[6:7], s[38:39]
; HAWAII-NEXT: s_mov_b64 s[8:9], s[36:37]
; HAWAII-NEXT: s_mov_b64 s[10:11], s[34:35]
; HAWAII-NEXT: s_mov_b32 s12, s53
; HAWAII-NEXT: s_mov_b32 s13, s52
; HAWAII-NEXT: s_mov_b32 s14, s51
; HAWAII-NEXT: s_mov_b32 s15, s50
; HAWAII-NEXT: v_mov_b32_e32 v0, v2
; HAWAII-NEXT: v_mov_b32_e32 v1, v3
; HAWAII-NEXT: s_swappc_b64 s[30:31], s[16:17]
; HAWAII-NEXT: v_mov_b32_e32 v4, v0
; HAWAII-NEXT: ; implicit-def: $vgpr0_vgpr1
; HAWAII-NEXT: ; implicit-def: $vgpr31
; HAWAII-NEXT: ; implicit-def: $vgpr2
; HAWAII-NEXT: ; implicit-def: $vgpr3
; HAWAII-NEXT: s_xor_b64 exec, exec, s[64:65]
; HAWAII-NEXT: s_cbranch_execnz .LBB18_1
; HAWAII-NEXT: ; %bb.2:
; HAWAII-NEXT: s_mov_b64 exec, s[54:55]
; HAWAII-NEXT: v_mov_b32_e32 v0, v4
; HAWAII-NEXT: v_readlane_b32 s65, v40, 17
; HAWAII-NEXT: v_readlane_b32 s64, v40, 16
; HAWAII-NEXT: v_readlane_b32 s55, v40, 15
; HAWAII-NEXT: v_readlane_b32 s54, v40, 14
; HAWAII-NEXT: v_readlane_b32 s53, v40, 13
; HAWAII-NEXT: v_readlane_b32 s52, v40, 12
; HAWAII-NEXT: v_readlane_b32 s51, v40, 11
; HAWAII-NEXT: v_readlane_b32 s50, v40, 10
; HAWAII-NEXT: v_readlane_b32 s49, v40, 9
; HAWAII-NEXT: v_readlane_b32 s48, v40, 8
; HAWAII-NEXT: v_readlane_b32 s39, v40, 7
; HAWAII-NEXT: v_readlane_b32 s38, v40, 6
; HAWAII-NEXT: v_readlane_b32 s37, v40, 5
; HAWAII-NEXT: v_readlane_b32 s36, v40, 4
; HAWAII-NEXT: v_readlane_b32 s35, v40, 3
; HAWAII-NEXT: v_readlane_b32 s34, v40, 2
; HAWAII-NEXT: v_readlane_b32 s31, v40, 1
; HAWAII-NEXT: v_readlane_b32 s30, v40, 0
; HAWAII-NEXT: s_mov_b32 s32, s33
; HAWAII-NEXT: v_readlane_b32 s4, v40, 18
; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1
; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; HAWAII-NEXT: s_mov_b64 exec, s[6:7]
; HAWAII-NEXT: s_mov_b32 s33, s4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
; HAWAII-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s16, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-NEXT: v_writelane_b32 v40, s16, 18
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s35, 3
; GFX9-NEXT: v_writelane_b32 v40, s36, 4
; GFX9-NEXT: v_writelane_b32 v40, s37, 5
; GFX9-NEXT: v_writelane_b32 v40, s38, 6
; GFX9-NEXT: v_writelane_b32 v40, s39, 7
; GFX9-NEXT: v_writelane_b32 v40, s48, 8
; GFX9-NEXT: v_writelane_b32 v40, s49, 9
; GFX9-NEXT: v_writelane_b32 v40, s50, 10
; GFX9-NEXT: v_writelane_b32 v40, s51, 11
; GFX9-NEXT: v_writelane_b32 v40, s52, 12
; GFX9-NEXT: v_writelane_b32 v40, s53, 13
; GFX9-NEXT: v_writelane_b32 v40, s54, 14
; GFX9-NEXT: v_writelane_b32 v40, s55, 15
; GFX9-NEXT: v_writelane_b32 v40, s64, 16
; GFX9-NEXT: s_mov_b32 s50, s15
; GFX9-NEXT: s_mov_b32 s51, s14
; GFX9-NEXT: s_mov_b32 s52, s13
; GFX9-NEXT: s_mov_b32 s53, s12
; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11]
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: s_mov_b64 s[54:55], exec
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s65, 17
; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_readfirstlane_b32 s16, v0
; GFX9-NEXT: v_readfirstlane_b32 s17, v1
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
; GFX9-NEXT: s_and_saveexec_b64 s[64:65], vcc
; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s53
; GFX9-NEXT: s_mov_b32 s13, s52
; GFX9-NEXT: s_mov_b32 s14, s51
; GFX9-NEXT: s_mov_b32 s15, s50
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_mov_b32_e32 v4, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr31
; GFX9-NEXT: ; implicit-def: $vgpr2
; GFX9-NEXT: ; implicit-def: $vgpr3
; GFX9-NEXT: s_xor_b64 exec, exec, s[64:65]
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2:
; GFX9-NEXT: s_mov_b64 exec, s[54:55]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_readlane_b32 s65, v40, 17
; GFX9-NEXT: v_readlane_b32 s64, v40, 16
; GFX9-NEXT: v_readlane_b32 s55, v40, 15
; GFX9-NEXT: v_readlane_b32 s54, v40, 14
; GFX9-NEXT: v_readlane_b32 s53, v40, 13
; GFX9-NEXT: v_readlane_b32 s52, v40, 12
; GFX9-NEXT: v_readlane_b32 s51, v40, 11
; GFX9-NEXT: v_readlane_b32 s50, v40, 10
; GFX9-NEXT: v_readlane_b32 s49, v40, 9
; GFX9-NEXT: v_readlane_b32 s48, v40, 8
; GFX9-NEXT: v_readlane_b32 s39, v40, 7
; GFX9-NEXT: v_readlane_b32 s38, v40, 6
; GFX9-NEXT: v_readlane_b32 s37, v40, 5
; GFX9-NEXT: v_readlane_b32 s36, v40, 4
; GFX9-NEXT: v_readlane_b32 s35, v40, 3
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 18
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
entry:
  %add = add i32 %b, %c
  %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
  ret i32 %ret
}
|
|
|
|
; Callee taking an i32 plus two byval aggregates ([3 x i32] with align 16 and
; [2 x i64]).
declare hidden fastcc void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64]))

; Sibling call that passes two local allocas by value. The caller itself
; receives %a and an unnamed [64 x i32]; it fills %alloca0 with 9s and
; %alloca1 with zeros and forwards both as byval arguments.
; The shared checks expect the 9s (v1) and zeros (v2) to be stored at two sets
; of s32-relative offsets (144-172 and 0-28) and the transfer to be a jump via
; s_setpc_b64, with no s_swappc_b64.
define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
; GCN-LABEL: sibling_call_fastcc_multi_byval:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 9
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %alloca0 = alloca [3 x i32], align 16, addrspace(5)
  %alloca1 = alloca [2 x i64], align 8, addrspace(5)
  store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca0
  store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca1
  tail call fastcc void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) %alloca0, ptr addrspace(5) byval([2 x i64]) %alloca1)
  ret void
}
|
|
|
|
; Callee taking a byval [3 x i32] (align 16), a [32 x i32] by value and a
; trailing i32.
declare hidden fastcc void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32)

; Callee has a byval and a non-byval stack-passed argument.
;
; The caller fills a local [3 x i32] with 9s and forwards it byval together
; with an all-zero [32 x i32] and its own %stack.out.arg.
; The shared checks expect the 9s to be stored at s32 offsets 144-152 and 0-8,
; a zero at offset 12, v0 at offset 16 (presumably the incoming
; %stack.out.arg -- confirm against the calling convention), v0-v30 to be
; zeroed, and the transfer to be a jump via s_setpc_b64.
define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
; GCN-LABEL: sibling_call_byval_and_stack_passed:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 9
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %alloca = alloca [3 x i32], align 16, addrspace(5)
  store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
  tail call fastcc void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
  ret void
}
|
|
|
|
; Callee for the i64 sibling-call test.
declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)

; Sibling call that forwards a single i64 argument and returns the callee's
; i64 result. The shared checks expect only the PC-relative address of
; @i64_fastcc_i64 to be computed, followed by a jump via s_setpc_b64, with no
; register moves in between.
define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
; GCN-LABEL: sibling_call_i64_fastcc_i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
  ret i64 %ret
}
|
|
|
|
; Callee for the addrspace(1) pointer sibling-call test.
declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0)

; Sibling call that forwards a single addrspace(1) pointer and returns the
; callee's pointer result. The shared checks expect only the PC-relative
; address of @p1i8_fastcc_p1i8 to be computed, followed by a jump via
; s_setpc_b64.
define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 {
; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
  ret ptr addrspace(1) %ret
}
|
|
|
|
; Callee for the scalar i16 sibling-call test.
declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)

; Sibling call that forwards a single i16 argument and returns the callee's
; i16 result. The shared checks expect the same code on all three targets:
; the PC-relative address of @i16_fastcc_i16 followed by a jump via
; s_setpc_b64, with no extension or masking of the argument.
define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
; GCN-LABEL: sibling_call_i16_fastcc_i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
  ret i16 %ret
}
|
|
|
|
; Callee for the scalar half sibling-call test.
declare hidden fastcc half @f16_fastcc_f16(half %arg0)

; Sibling call that forwards a single half argument and returns the callee's
; half result. The shared checks expect the same code on all three targets:
; the PC-relative address of @f16_fastcc_f16 followed by a jump via
; s_setpc_b64, with no conversion of the argument.
define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
; GCN-LABEL: sibling_call_f16_fastcc_f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc half @f16_fastcc_f16(half %a)
  ret half %ret
}
|
|
|
|
; Callee for the <3 x i16> sibling-call test.
declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)

; Sibling call that forwards a <3 x i16> argument and returns the callee's
; <3 x i16> result. This is the one vector case here with per-target checks:
; the fiji and gfx900 runs expect a bare address computation plus a jump via
; s_setpc_b64, while the hawaii run additionally expects v1 to be masked with
; 0xffff before the jump.
define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
; FIJI-LABEL: sibling_call_v3i16_fastcc_v3i16:
; FIJI: ; %bb.0: ; %entry
; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FIJI-NEXT: s_getpc_b64 s[16:17]
; FIJI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
; FIJI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
; FIJI-NEXT: s_setpc_b64 s[16:17]
;
; HAWAII-LABEL: sibling_call_v3i16_fastcc_v3i16:
; HAWAII: ; %bb.0: ; %entry
; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HAWAII-NEXT: v_and_b32_e32 v1, 0xffff, v1
; HAWAII-NEXT: s_getpc_b64 s[16:17]
; HAWAII-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
; HAWAII-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
; HAWAII-NEXT: s_setpc_b64 s[16:17]
;
; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
; GFX9-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
  ret <3 x i16> %ret
}
|
|
|
|
; Callee for the <4 x i16> sibling-call test.
declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)

; Sibling call that forwards a <4 x i16> argument and returns the callee's
; <4 x i16> result. The shared checks expect the same code on all three
; targets: the PC-relative address of @v4i16_fastcc_v4i16 followed by a jump
; via s_setpc_b64, with no repacking of the vector.
define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
  ret <4 x i16> %ret
}
|
|
|
|
; Callee for the <2 x i64> sibling-call test.
declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)

; Sibling call that forwards a <2 x i64> argument and returns the callee's
; <2 x i64> result. The shared checks expect only the PC-relative address of
; @v2i64_fastcc_v2i64 to be computed, followed by a jump via s_setpc_b64.
define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
entry:
  %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
  ret <2 x i64> %ret
}
|
|
|
|
; Attribute group #0: nounwind only. No function in this part of the file
; references it; it may be used earlier in the file.
attributes #0 = { nounwind }
; Attribute group #1, attached to the test functions above: nounwind and
; noinline plus the "amdgpu-no-*" string attributes listed below.
attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
|