[AMDGPU] Move LowerBufferFatPointers after LoadStoreVectorizer and remove the fixme (#161531)

Move LowerBufferFatPointers pass after CodegenPrepare and
LoadStoreVectorizer pass, and remove the fixme about that.
This commit is contained in:
Gang Chen 2025-10-01 17:52:15 -07:00 committed by GitHub
parent bdea159093
commit 640644d68a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 1825 additions and 2332 deletions

View File

@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
TargetPassConfig::addCodeGenPrepare();
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(createLoadStoreVectorizerPass());
if (TM->getTargetTriple().isAMDGCN()) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
//
// FIXME: This should ideally be put after the LoadStoreVectorizer.
// However, due to some annoying facts about ResourceUsageAnalysis,
// (especially as exercised in the resource-usage-dead-function test),
// we need all the function passes codegenprepare all the way through
// said resource usage analysis to run on the call graph produced
// before codegenprepare runs (because codegenprepare will knock some
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
addPass(new DummyCGSCCPass());
}
TargetPassConfig::addCodeGenPrepare();
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(createLoadStoreVectorizerPass());
// LowerSwitch pass may introduce unreachable blocks that can
// cause unexpected behavior for subsequent passes. Placing it
// here seems better that these blocks would get cleaned up by
@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
if (EnableLowerKernelArguments)
addPass(AMDGPULowerKernelArgumentsPass(TM));
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(LoadStoreVectorizerPass());
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
//
// FIXME: This should ideally be put after the LoadStoreVectorizer.
// However, due to some annoying facts about ResourceUsageAnalysis,
// (especially as exercised in the resource-usage-dead-function test),
// we need all the function passes codegenprepare all the way through
// said resource usage analysis to run on the call graph produced
// before codegenprepare runs (because codegenprepare will knock some
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
addPass.requireCGSCCOrder();
addPass(AMDGPULowerIntrinsicsPass(TM));
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(LoadStoreVectorizerPass());
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better that these
// blocks would get cleaned up by UnreachableBlockElim inserted next in the

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -782,69 +782,90 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-LABEL: memcpy_known_medium:
; SDAG-GFX942: ; %bb.0:
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34
; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34
; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44
; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54
; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54
; SDAG-GFX942-NEXT: s_mov_b32 s16, 0
; SDAG-GFX942-NEXT: s_mov_b32 s5, s16
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; SDAG-GFX942-NEXT: s_mov_b32 s13, s2
; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
; SDAG-GFX942-NEXT: s_mov_b32 s17, s2
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; SDAG-GFX942-NEXT: s_mov_b32 s13, s14
; SDAG-GFX942-NEXT: s_mov_b32 s3, s16
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17]
; SDAG-GFX942-NEXT: s_mov_b32 s17, s12
; SDAG-GFX942-NEXT: s_mov_b32 s2, s11
; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13]
; SDAG-GFX942-NEXT: s_mov_b32 s13, s10
; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17]
; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: s_nop 0
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: s_nop 0
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
; SDAG-GFX942-NEXT: s_nop 1
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
;
@ -852,84 +873,87 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_clause 0x3
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34
; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34
; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54
; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
; SDAG-GFX1100-NEXT: s_mov_b32 s15, s12
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s12
; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s16
; SDAG-GFX1100-NEXT: s_mov_b32 s15, s16
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
; SDAG-GFX1100-NEXT: s_mov_b32 s14, s1
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
; SDAG-GFX1100-NEXT: s_mov_b32 s16, s11
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[14:15], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s18
; SDAG-GFX1100-NEXT: s_mov_b32 s12, s1
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s2
; SDAG-GFX1100-NEXT: s_mov_b32 s14, s11
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[16:17]
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s18
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9
; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s10
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
; SDAG-GFX1100-NEXT: s_mov_b32 s17, s10
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v60, s1
; SDAG-GFX1100-NEXT: s_add_i32 s1, s8, s16
; SDAG-GFX1100-NEXT: s_addk_i32 s16, 0x100
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v64, s1
; SDAG-GFX1100-NEXT: s_cmpk_lt_u32 s16, 0x100
; SDAG-GFX1100-NEXT: s_clause 0xf
; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
; SDAG-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
; SDAG-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32
; SDAG-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48
; SDAG-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64
; SDAG-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80
; SDAG-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96
; SDAG-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112
; SDAG-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128
; SDAG-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144
; SDAG-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160
; SDAG-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176
; SDAG-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192
; SDAG-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
; SDAG-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
; SDAG-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v60, s[4:7], 0 offen
; SDAG-GFX1100-NEXT: buffer_load_b128 v[4:7], v60, s[4:7], 0 offen offset:16
; SDAG-GFX1100-NEXT: buffer_load_b128 v[8:11], v60, s[4:7], 0 offen offset:32
; SDAG-GFX1100-NEXT: buffer_load_b128 v[12:15], v60, s[4:7], 0 offen offset:48
; SDAG-GFX1100-NEXT: buffer_load_b128 v[16:19], v60, s[4:7], 0 offen offset:64
; SDAG-GFX1100-NEXT: buffer_load_b128 v[20:23], v60, s[4:7], 0 offen offset:80
; SDAG-GFX1100-NEXT: buffer_load_b128 v[24:27], v60, s[4:7], 0 offen offset:96
; SDAG-GFX1100-NEXT: buffer_load_b128 v[28:31], v60, s[4:7], 0 offen offset:112
; SDAG-GFX1100-NEXT: buffer_load_b128 v[32:35], v60, s[4:7], 0 offen offset:128
; SDAG-GFX1100-NEXT: buffer_load_b128 v[36:39], v60, s[4:7], 0 offen offset:144
; SDAG-GFX1100-NEXT: buffer_load_b128 v[40:43], v60, s[4:7], 0 offen offset:160
; SDAG-GFX1100-NEXT: buffer_load_b128 v[44:47], v60, s[4:7], 0 offen offset:176
; SDAG-GFX1100-NEXT: buffer_load_b128 v[48:51], v60, s[4:7], 0 offen offset:192
; SDAG-GFX1100-NEXT: buffer_load_b128 v[52:55], v60, s[4:7], 0 offen offset:208
; SDAG-GFX1100-NEXT: buffer_load_b128 v[56:59], v60, s[4:7], 0 offen offset:224
; SDAG-GFX1100-NEXT: buffer_load_b128 v[60:63], v60, s[4:7], 0 offen offset:240
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v64, s[12:15], 0 offen
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16
; SDAG-GFX1100-NEXT: buffer_store_b128 v[4:7], v64, s[12:15], 0 offen offset:16
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32
; SDAG-GFX1100-NEXT: buffer_store_b128 v[8:11], v64, s[12:15], 0 offen offset:32
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48
; SDAG-GFX1100-NEXT: buffer_store_b128 v[12:15], v64, s[12:15], 0 offen offset:48
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64
; SDAG-GFX1100-NEXT: buffer_store_b128 v[16:19], v64, s[12:15], 0 offen offset:64
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80
; SDAG-GFX1100-NEXT: buffer_store_b128 v[20:23], v64, s[12:15], 0 offen offset:80
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96
; SDAG-GFX1100-NEXT: buffer_store_b128 v[24:27], v64, s[12:15], 0 offen offset:96
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112
; SDAG-GFX1100-NEXT: buffer_store_b128 v[28:31], v64, s[12:15], 0 offen offset:112
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128
; SDAG-GFX1100-NEXT: buffer_store_b128 v[32:35], v64, s[12:15], 0 offen offset:128
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144
; SDAG-GFX1100-NEXT: buffer_store_b128 v[36:39], v64, s[12:15], 0 offen offset:144
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160
; SDAG-GFX1100-NEXT: buffer_store_b128 v[40:43], v64, s[12:15], 0 offen offset:160
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(4)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176
; SDAG-GFX1100-NEXT: buffer_store_b128 v[44:47], v64, s[12:15], 0 offen offset:176
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192
; SDAG-GFX1100-NEXT: buffer_store_b128 v[48:51], v64, s[12:15], 0 offen offset:192
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208
; SDAG-GFX1100-NEXT: buffer_store_b128 v[52:55], v64, s[12:15], 0 offen offset:208
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
; SDAG-GFX1100-NEXT: buffer_store_b128 v[56:59], v64, s[12:15], 0 offen offset:224
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
; SDAG-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240
; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1
; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX1100-NEXT: s_endpgm
;
@ -957,52 +981,50 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@ -1037,8 +1059,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
; GISEL-GFX1100-NEXT: s_clause 0xf
; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
@ -1056,7 +1077,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14)
@ -1089,7 +1109,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
; GISEL-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0
; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX1100-NEXT: s_endpgm

View File

@ -11,9 +11,9 @@
; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
define void @empty() {
ret void

View File

@ -232,15 +232,15 @@
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: CodeGen Prepare
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: AMDGPU lower intrinsics
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
; GCN-O1-NEXT: DummyCGSCCPass
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: CodeGen Prepare
; GCN-O1-NEXT: Lazy Value Information Analysis
; GCN-O1-NEXT: Lower SwitchInst's to branches
; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators
@ -533,21 +533,21 @@
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: CodeGen Prepare
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: CodeGen Prepare
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis
; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches
; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators
@ -852,21 +852,21 @@
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: CodeGen Prepare
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: GPU Load and Store Vectorizer
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O2-NEXT: AMDGPU lower intrinsics
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
; GCN-O2-NEXT: DummyCGSCCPass
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: CodeGen Prepare
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: GPU Load and Store Vectorizer
; GCN-O2-NEXT: Lazy Value Information Analysis
; GCN-O2-NEXT: Lower SwitchInst's to branches
; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators
@ -1186,21 +1186,21 @@
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: CodeGen Prepare
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: GPU Load and Store Vectorizer
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O3-NEXT: AMDGPU lower intrinsics
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
; GCN-O3-NEXT: DummyCGSCCPass
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: CodeGen Prepare
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: GPU Load and Store Vectorizer
; GCN-O3-NEXT: Lazy Value Information Analysis
; GCN-O3-NEXT: Lower SwitchInst's to branches
; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators

View File

@ -7,16 +7,12 @@
@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
; GCN-LABEL: unreachable:
; Function info:
; codeLenInByte = 4
define internal fastcc void @unreachable() {
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
call void %fptr()
unreachable
}
; GCN-LABEL: entry:
; GCN-NOT: s_swappc_b64
; GCN: s_endpgm