
After replacing VGPR MFMAs with the AGPR form, we've alleviated VGPR pressure, which may have triggered spills during allocation. Identify these spill slots, try to reassign them to the newly freed VGPRs, and replace the spill instructions with copies. Fixes #154260.
425 lines
28 KiB
LLVM
425 lines
28 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck %s

; After reassigning the MFMA to use AGPRs, we've alleviated enough
; register pressure to try eliminating the spill of %spill with the freed
; up VGPR.
;
; Test shape:
;   * %mai is the MFMA result; it is passed as a VGPR ("v") input to the
;     inline asm that defines %spill.
;   * %v defines two <32 x i32> VGPR results and %a0/%a1 each define a
;     <32 x i32> AGPR result; the volatile stores at the end use %v0, %v1 and
;     %spill after all of the inline asm definitions.
;
; In the expected output below, the MFMA is emitted with AGPR operands
; (a[0:3]) and %spill is defined in v[10:13] and stored straight from there.
; The only "Folded Spill"/"Folded Reload" scratch accesses are the stores at
; function entry and the matching loads before the return, covering v40-v47,
; v56-v63 and a32-a63.
define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 {
; CHECK-LABEL: eliminate_spill_after_mfma_rewrite:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[32:63], v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a63, v31
; CHECK-NEXT: v_accvgpr_write_b32 a62, v30
; CHECK-NEXT: v_accvgpr_write_b32 a61, v29
; CHECK-NEXT: v_accvgpr_write_b32 a60, v28
; CHECK-NEXT: v_accvgpr_write_b32 a59, v27
; CHECK-NEXT: v_accvgpr_write_b32 a58, v26
; CHECK-NEXT: v_accvgpr_write_b32 a57, v25
; CHECK-NEXT: v_accvgpr_write_b32 a56, v24
; CHECK-NEXT: v_accvgpr_write_b32 a55, v23
; CHECK-NEXT: v_accvgpr_write_b32 a54, v22
; CHECK-NEXT: v_accvgpr_write_b32 a53, v21
; CHECK-NEXT: v_accvgpr_write_b32 a52, v20
; CHECK-NEXT: v_accvgpr_write_b32 a51, v19
; CHECK-NEXT: v_accvgpr_write_b32 a50, v18
; CHECK-NEXT: v_accvgpr_write_b32 a49, v17
; CHECK-NEXT: v_accvgpr_write_b32 a48, v16
; CHECK-NEXT: v_accvgpr_write_b32 a47, v15
; CHECK-NEXT: v_accvgpr_write_b32 a46, v14
; CHECK-NEXT: v_accvgpr_write_b32 a45, v13
; CHECK-NEXT: v_accvgpr_write_b32 a44, v12
; CHECK-NEXT: v_accvgpr_write_b32 a43, v11
; CHECK-NEXT: v_accvgpr_write_b32 a42, v10
; CHECK-NEXT: v_accvgpr_write_b32 a41, v9
; CHECK-NEXT: v_accvgpr_write_b32 a40, v8
; CHECK-NEXT: v_accvgpr_write_b32 a39, v7
; CHECK-NEXT: v_accvgpr_write_b32 a38, v6
; CHECK-NEXT: v_accvgpr_write_b32 a37, v5
; CHECK-NEXT: v_accvgpr_write_b32 a36, v4
; CHECK-NEXT: v_accvgpr_write_b32 a35, v3
; CHECK-NEXT: v_accvgpr_write_b32 a34, v2
; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[10:13]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0)
  ; Two <32 x i32> values defined in VGPRs by a single inline asm.
  %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"()
  %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0
  %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1
  ; The value this test is about: a VGPR def that also consumes %mai in VGPRs.
  %spill = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai)
  ; Two <32 x i32> values defined in AGPRs; their results are otherwise unused.
  %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"()
  %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"()
  ; Volatile stores use %v0, %v1 and %spill after every asm definition above.
  store volatile <32 x i32> %v0, ptr addrspace(1) %ptr
  store volatile <32 x i32> %v1, ptr addrspace(1) %ptr
  store volatile <4 x i32> %spill, ptr addrspace(1) %ptr
  ret void
}

; Same, except we fold out 2 spills from %spill0 and %spill1
;
; Both %spill0 and %spill1 are VGPR inline asm definitions that take the MFMA
; result %mai as a VGPR input. In the expected output below they are defined
; in v[14:17] and v[10:13] and stored straight from those registers. The only
; "Folded Spill"/"Folded Reload" scratch accesses are the stores at function
; entry and the matching loads before the return, covering v40-v47, v56-v63
; and a32-a63.
define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 {
; CHECK-LABEL: eliminate_spill_after_mfma_rewrite_x2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a3, v5
; CHECK-NEXT: v_accvgpr_write_b32 a2, v4
; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[32:63], v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a63, v31
; CHECK-NEXT: v_accvgpr_write_b32 a62, v30
; CHECK-NEXT: v_accvgpr_write_b32 a61, v29
; CHECK-NEXT: v_accvgpr_write_b32 a60, v28
; CHECK-NEXT: v_accvgpr_write_b32 a59, v27
; CHECK-NEXT: v_accvgpr_write_b32 a58, v26
; CHECK-NEXT: v_accvgpr_write_b32 a57, v25
; CHECK-NEXT: v_accvgpr_write_b32 a56, v24
; CHECK-NEXT: v_accvgpr_write_b32 a55, v23
; CHECK-NEXT: v_accvgpr_write_b32 a54, v22
; CHECK-NEXT: v_accvgpr_write_b32 a53, v21
; CHECK-NEXT: v_accvgpr_write_b32 a52, v20
; CHECK-NEXT: v_accvgpr_write_b32 a51, v19
; CHECK-NEXT: v_accvgpr_write_b32 a50, v18
; CHECK-NEXT: v_accvgpr_write_b32 a49, v17
; CHECK-NEXT: v_accvgpr_write_b32 a48, v16
; CHECK-NEXT: v_accvgpr_write_b32 a47, v15
; CHECK-NEXT: v_accvgpr_write_b32 a46, v14
; CHECK-NEXT: v_accvgpr_write_b32 a45, v13
; CHECK-NEXT: v_accvgpr_write_b32 a44, v12
; CHECK-NEXT: v_accvgpr_write_b32 a43, v11
; CHECK-NEXT: v_accvgpr_write_b32 a42, v10
; CHECK-NEXT: v_accvgpr_write_b32 a41, v9
; CHECK-NEXT: v_accvgpr_write_b32 a40, v8
; CHECK-NEXT: v_accvgpr_write_b32 a39, v7
; CHECK-NEXT: v_accvgpr_write_b32 a38, v6
; CHECK-NEXT: v_accvgpr_write_b32 a37, v5
; CHECK-NEXT: v_accvgpr_write_b32 a36, v4
; CHECK-NEXT: v_accvgpr_write_b32 a35, v3
; CHECK-NEXT: v_accvgpr_write_b32 a34, v2
; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
; CHECK-NEXT: v_accvgpr_read_b32 v7, a3
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_accvgpr_read_b32 v6, a2
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[14:17]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[10:13]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0)
  ; Two <32 x i32> values defined in VGPRs by a single inline asm.
  %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"()
  %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0
  %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1
  ; The two values this test is about; each one also consumes %mai in VGPRs.
  %spill0 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai)
  %spill1 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai)
  ; Two <32 x i32> values defined in AGPRs; their results are otherwise unused.
  %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"()
  %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"()
  ; Volatile stores use %v0, %v1, %spill0 and %spill1 after every asm
  ; definition above.
  store volatile <32 x i32> %v0, ptr addrspace(1) %ptr
  store volatile <32 x i32> %v1, ptr addrspace(1) %ptr
  store volatile <4 x i32> %spill0, ptr addrspace(1) %ptr
  store volatile <4 x i32> %spill1, ptr addrspace(1) %ptr
  ret void
}

; MFMA intrinsic called by both test functions in this file.
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) #1

; #0 is attached to both test functions; #1 is attached to the intrinsic
; declaration above.
attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }