[AMDGPU] Move LowerBufferFatPointers after LoadStoreVectorizer and remove the fixme (#161531)
Move the LowerBufferFatPointers pass after the CodeGenPrepare and LoadStoreVectorizer passes, and remove the FIXME that asked for this reordering.
This commit is contained in:
parent
bdea159093
commit
640644d68a
@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
|
||||
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
|
||||
addPass(createAMDGPULowerKernelArgumentsPass());
|
||||
|
||||
TargetPassConfig::addCodeGenPrepare();
|
||||
|
||||
if (isPassEnabled(EnableLoadStoreVectorizer))
|
||||
addPass(createLoadStoreVectorizerPass());
|
||||
|
||||
if (TM->getTargetTriple().isAMDGCN()) {
|
||||
// This lowering has been placed after codegenprepare to take advantage of
|
||||
// address mode matching (which is why it isn't put with the LDS lowerings).
|
||||
@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
|
||||
// but has been put before switch lowering and CFG flattening so that those
|
||||
// passes can run on the more optimized control flow this pass creates in
|
||||
// many cases.
|
||||
//
|
||||
// FIXME: This should ideally be put after the LoadStoreVectorizer.
|
||||
// However, due to some annoying facts about ResourceUsageAnalysis,
|
||||
// (especially as exercised in the resource-usage-dead-function test),
|
||||
// we need all the function passes codegenprepare all the way through
|
||||
// said resource usage analysis to run on the call graph produced
|
||||
// before codegenprepare runs (because codegenprepare will knock some
|
||||
// nodes out of the graph, which leads to function-level passes not
|
||||
// being run on them, which causes crashes in the resource usage analysis).
|
||||
addPass(createAMDGPULowerBufferFatPointersPass());
|
||||
addPass(createAMDGPULowerIntrinsicsLegacyPass());
|
||||
// In accordance with the above FIXME, manually force all the
|
||||
@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
|
||||
addPass(new DummyCGSCCPass());
|
||||
}
|
||||
|
||||
TargetPassConfig::addCodeGenPrepare();
|
||||
|
||||
if (isPassEnabled(EnableLoadStoreVectorizer))
|
||||
addPass(createLoadStoreVectorizerPass());
|
||||
|
||||
// LowerSwitch pass may introduce unreachable blocks that can
|
||||
// cause unexpected behavior for subsequent passes. Placing it
|
||||
// here seems better that these blocks would get cleaned up by
|
||||
@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
|
||||
if (EnableLowerKernelArguments)
|
||||
addPass(AMDGPULowerKernelArgumentsPass(TM));
|
||||
|
||||
Base::addCodeGenPrepare(addPass);
|
||||
|
||||
if (isPassEnabled(EnableLoadStoreVectorizer))
|
||||
addPass(LoadStoreVectorizerPass());
|
||||
|
||||
// This lowering has been placed after codegenprepare to take advantage of
|
||||
// address mode matching (which is why it isn't put with the LDS lowerings).
|
||||
// It could be placed anywhere before uniformity annotations (an analysis
|
||||
@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
|
||||
// but has been put before switch lowering and CFG flattening so that those
|
||||
// passes can run on the more optimized control flow this pass creates in
|
||||
// many cases.
|
||||
//
|
||||
// FIXME: This should ideally be put after the LoadStoreVectorizer.
|
||||
// However, due to some annoying facts about ResourceUsageAnalysis,
|
||||
// (especially as exercised in the resource-usage-dead-function test),
|
||||
// we need all the function passes codegenprepare all the way through
|
||||
// said resource usage analysis to run on the call graph produced
|
||||
// before codegenprepare runs (because codegenprepare will knock some
|
||||
// nodes out of the graph, which leads to function-level passes not
|
||||
// being run on them, which causes crashes in the resource usage analysis).
|
||||
addPass(AMDGPULowerBufferFatPointersPass(TM));
|
||||
addPass.requireCGSCCOrder();
|
||||
|
||||
addPass(AMDGPULowerIntrinsicsPass(TM));
|
||||
|
||||
Base::addCodeGenPrepare(addPass);
|
||||
|
||||
if (isPassEnabled(EnableLoadStoreVectorizer))
|
||||
addPass(LoadStoreVectorizerPass());
|
||||
|
||||
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
|
||||
// behavior for subsequent passes. Placing it here seems better that these
|
||||
// blocks would get cleaned up by UnreachableBlockElim inserted next in the
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -782,69 +782,90 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; SDAG-GFX942-LABEL: memcpy_known_medium:
|
||||
; SDAG-GFX942: ; %bb.0:
|
||||
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34
|
||||
; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34
|
||||
; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44
|
||||
; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
|
||||
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s16, 0
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s5, s16
|
||||
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s13, s2
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s17, s2
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s13, s14
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s3, s16
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s17, s12
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s2, s11
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s13, s10
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17]
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
|
||||
; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
|
||||
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
|
||||
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
|
||||
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
|
||||
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
|
||||
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
|
||||
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
|
||||
; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
|
||||
; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
|
||||
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
|
||||
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
|
||||
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
|
||||
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
|
||||
; SDAG-GFX942-NEXT: s_nop 0
|
||||
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
|
||||
; SDAG-GFX942-NEXT: s_nop 0
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
|
||||
; SDAG-GFX942-NEXT: s_nop 1
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
|
||||
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
|
||||
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
|
||||
; SDAG-GFX942-NEXT: s_endpgm
|
||||
;
|
||||
@ -852,84 +873,87 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; SDAG-GFX1100: ; %bb.0:
|
||||
; SDAG-GFX1100-NEXT: s_clause 0x3
|
||||
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34
|
||||
; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34
|
||||
; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
|
||||
; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s15, s12
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s12
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0
|
||||
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s16
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s15, s16
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s14, s1
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s16, s11
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[14:15], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s18
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s12, s1
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s2
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s14, s11
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[16:17]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s18
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s10
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s10
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16
|
||||
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
|
||||
; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
|
||||
; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
|
||||
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
|
||||
; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
|
||||
; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16
|
||||
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v60, s1
|
||||
; SDAG-GFX1100-NEXT: s_add_i32 s1, s8, s16
|
||||
; SDAG-GFX1100-NEXT: s_addk_i32 s16, 0x100
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v64, s1
|
||||
; SDAG-GFX1100-NEXT: s_cmpk_lt_u32 s16, 0x100
|
||||
; SDAG-GFX1100-NEXT: s_clause 0xf
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v60, s[4:7], 0 offen
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[4:7], v60, s[4:7], 0 offen offset:16
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[8:11], v60, s[4:7], 0 offen offset:32
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[12:15], v60, s[4:7], 0 offen offset:48
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[16:19], v60, s[4:7], 0 offen offset:64
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[20:23], v60, s[4:7], 0 offen offset:80
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[24:27], v60, s[4:7], 0 offen offset:96
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[28:31], v60, s[4:7], 0 offen offset:112
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[32:35], v60, s[4:7], 0 offen offset:128
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[36:39], v60, s[4:7], 0 offen offset:144
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[40:43], v60, s[4:7], 0 offen offset:160
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[44:47], v60, s[4:7], 0 offen offset:176
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[48:51], v60, s[4:7], 0 offen offset:192
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[52:55], v60, s[4:7], 0 offen offset:208
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[56:59], v60, s[4:7], 0 offen offset:224
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[60:63], v60, s[4:7], 0 offen offset:240
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v64, s[12:15], 0 offen
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[4:7], v64, s[12:15], 0 offen offset:16
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[8:11], v64, s[12:15], 0 offen offset:32
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[12:15], v64, s[12:15], 0 offen offset:48
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[16:19], v64, s[12:15], 0 offen offset:64
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[20:23], v64, s[12:15], 0 offen offset:80
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[24:27], v64, s[12:15], 0 offen offset:96
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[28:31], v64, s[12:15], 0 offen offset:112
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[32:35], v64, s[12:15], 0 offen offset:128
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[36:39], v64, s[12:15], 0 offen offset:144
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[40:43], v64, s[12:15], 0 offen offset:160
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(4)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[44:47], v64, s[12:15], 0 offen offset:176
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[48:51], v64, s[12:15], 0 offen offset:192
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[52:55], v64, s[12:15], 0 offen offset:208
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[56:59], v64, s[12:15], 0 offen offset:224
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
|
||||
; SDAG-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
|
||||
; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240
|
||||
; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
|
||||
; SDAG-GFX1100-NEXT: s_endpgm
|
||||
;
|
||||
@ -957,52 +981,50 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
|
||||
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
|
||||
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
|
||||
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
|
||||
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
|
||||
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
|
||||
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
|
||||
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
|
||||
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
|
||||
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
|
||||
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
|
||||
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
|
||||
; GISEL-GFX942-NEXT: s_endpgm
|
||||
@ -1037,8 +1059,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
|
||||
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
|
||||
; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
|
||||
; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
|
||||
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
|
||||
; GISEL-GFX1100-NEXT: s_clause 0xf
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
|
||||
@ -1056,7 +1077,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
|
||||
; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
|
||||
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
|
||||
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14)
|
||||
@ -1089,7 +1109,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
|
||||
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
|
||||
; GISEL-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0
|
||||
; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
|
||||
; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
|
||||
; GISEL-GFX1100-NEXT: s_endpgm
|
||||
|
||||
@ -11,9 +11,9 @@
|
||||
|
||||
; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
|
||||
|
||||
; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-
elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
|
||||
; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-
elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
|
||||
|
||||
; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,
init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
|
||||
; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,
init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
|
||||
|
||||
define void @empty() {
|
||||
ret void
|
||||
|
||||
@ -232,15 +232,15 @@
|
||||
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-NEXT: Natural Loop Information
|
||||
; GCN-O1-NEXT: CodeGen Prepare
|
||||
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
; GCN-O1-NEXT: AMDGPU lower intrinsics
|
||||
; GCN-O1-NEXT: CallGraph Construction
|
||||
; GCN-O1-NEXT: Call Graph SCC Pass Manager
|
||||
; GCN-O1-NEXT: DummyCGSCCPass
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-NEXT: Natural Loop Information
|
||||
; GCN-O1-NEXT: CodeGen Prepare
|
||||
; GCN-O1-NEXT: Lazy Value Information Analysis
|
||||
; GCN-O1-NEXT: Lower SwitchInst's to branches
|
||||
; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators
|
||||
@ -533,21 +533,21 @@
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
; GCN-O1-OPTS-NEXT: CodeGen Prepare
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics
|
||||
; GCN-O1-OPTS-NEXT: CallGraph Construction
|
||||
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
|
||||
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
; GCN-O1-OPTS-NEXT: CodeGen Prepare
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis
|
||||
; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches
|
||||
; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators
|
||||
@ -852,21 +852,21 @@
|
||||
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: CodeGen Prepare
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O2-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
; GCN-O2-NEXT: AMDGPU lower intrinsics
|
||||
; GCN-O2-NEXT: CallGraph Construction
|
||||
; GCN-O2-NEXT: Call Graph SCC Pass Manager
|
||||
; GCN-O2-NEXT: DummyCGSCCPass
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: CodeGen Prepare
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O2-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O2-NEXT: Lazy Value Information Analysis
|
||||
; GCN-O2-NEXT: Lower SwitchInst's to branches
|
||||
; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators
|
||||
@ -1186,21 +1186,21 @@
|
||||
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: CodeGen Prepare
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O3-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
|
||||
; GCN-O3-NEXT: AMDGPU lower intrinsics
|
||||
; GCN-O3-NEXT: CallGraph Construction
|
||||
; GCN-O3-NEXT: Call Graph SCC Pass Manager
|
||||
; GCN-O3-NEXT: DummyCGSCCPass
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: CodeGen Prepare
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: Scalar Evolution Analysis
|
||||
; GCN-O3-NEXT: GPU Load and Store Vectorizer
|
||||
; GCN-O3-NEXT: Lazy Value Information Analysis
|
||||
; GCN-O3-NEXT: Lower SwitchInst's to branches
|
||||
; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators
|
||||
|
||||
@ -7,16 +7,12 @@
|
||||
|
||||
@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
|
||||
|
||||
; GCN-LABEL: unreachable:
|
||||
; Function info:
|
||||
; codeLenInByte = 4
|
||||
define internal fastcc void @unreachable() {
|
||||
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
|
||||
call void %fptr()
|
||||
unreachable
|
||||
}
|
||||
|
||||
|
||||
; GCN-LABEL: entry:
|
||||
; GCN-NOT: s_swappc_b64
|
||||
; GCN: s_endpgm
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user