[AMDGPU] Use larger immediate values in S_NOP (#158990)

The S_NOP instruction has an immediate operand which is one less than
the number of cycles to delay for. The maximum value that may be encoded
in this field was increased in GFX8 and again in GFX12.
This commit is contained in:
Jay Foad 2025-09-16 15:51:06 +01:00 committed by GitHub
parent a42aac5f83
commit eeced0d073
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 2584 additions and 3452 deletions

View File

@ -1839,6 +1839,16 @@ public:
/// \returns true if the subtarget requires a wait for xcnt before atomic
/// flat/global stores & rmw.
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
unsigned getSNopBits() const {
if (getGeneration() >= AMDGPUSubtarget::GFX12)
return 7;
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 4;
return 3;
}
};
class GCNUserSGPRUsageInfo {

View File

@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned Quantity) const {
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned MaxSNopCount = 1u << ST.getSNopBits();
while (Quantity > 0) {
unsigned Arg = std::min(Quantity, 8u);
unsigned Arg = std::min(Quantity, MaxSNopCount);
Quantity -= Arg;
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
}

View File

@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
bb:
@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16

View File

@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
@ -28,8 +27,7 @@ bb:
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_read
; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
@ -80,8 +78,7 @@ bb:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]

View File

@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill

View File

@ -63,52 +63,41 @@ body: |
; GCN16-NEXT: successors: %bb.1(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.1
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.1:
; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.2:
; GCN16-NEXT: successors: %bb.3(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_NOP 0
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.3
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.3:
; GCN16-NEXT: liveins: $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 7
; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_RETURN
bb.0:
liveins: $sgpr6, $sgpr10_sgpr11

View File

@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v0, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
bb:

View File

@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 2
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: s_nop 10
; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@ -347,8 +341,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 2
; GFX90A-VGPR-NEXT: s_nop 10
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@ -375,8 +368,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: s_nop 10
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@ -505,8 +497,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
@ -542,8 +533,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: s_nop 10
; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@ -570,8 +560,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 2
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
@ -599,8 +588,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: s_nop 10
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
@ -632,8 +620,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
@ -671,8 +658,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 2
; GFX90A-VGPR-NEXT: s_nop 10
; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-VGPR-NEXT: s_endpgm
;
@ -795,8 +781,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
@ -823,8 +808,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
@ -847,8 +831,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
@ -871,8 +854,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
@ -896,8 +878,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -914,8 +895,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -932,8 +912,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -950,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -975,8 +953,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -993,8 +970,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1011,8 +987,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1029,8 +1004,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1054,8 +1028,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1072,8 +1045,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1090,8 +1062,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1108,8 +1079,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1133,8 +1103,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1151,8 +1120,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1169,8 +1137,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1187,8 +1154,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1212,8 +1178,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1230,8 +1195,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1248,8 +1212,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1266,8 +1229,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1299,8 +1261,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1325,8 +1286,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1354,8 +1314,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
@ -1383,8 +1342,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
@ -1416,8 +1374,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1442,8 +1399,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1468,8 +1424,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1494,8 +1449,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1527,8 +1481,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1553,8 +1506,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1579,8 +1531,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1605,8 +1556,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
@ -1639,8 +1589,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1666,8 +1615,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1695,8 +1643,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
@ -1724,8 +1671,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
@ -1757,8 +1703,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1784,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
@ -1813,8 +1757,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
@ -1842,8 +1785,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]

File diff suppressed because it is too large Load Diff

View File

@ -178,8 +178,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -223,8 +222,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -394,8 +392,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NEXT: s_nop 10
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -428,8 +425,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NEXT: s_nop 10
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16

View File

@ -479,8 +479,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_nop 8
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@ -598,8 +597,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: s_nop 8
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
@ -864,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_nop 8
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@ -983,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: s_nop 8
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
@ -1169,8 +1165,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1210,8 +1205,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 3
; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
@ -1234,8 +1228,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
@ -1342,8 +1335,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1383,8 +1375,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 3
; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
@ -1407,8 +1398,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
@ -2199,8 +2189,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -2228,8 +2217,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -2257,8 +2245,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
; HEURRC-NEXT: s_nop 10
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -2286,8 +2273,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: s_nop 10
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -2384,8 +2370,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -2413,8 +2398,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -2442,8 +2426,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
; HEURRC-NEXT: s_nop 10
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -2471,8 +2454,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: s_nop 10
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -3083,8 +3065,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_nop 8
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@ -3205,8 +3186,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
@ -3497,8 +3477,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_nop 8
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
@ -3619,8 +3598,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
@ -3827,8 +3805,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -3868,8 +3845,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 3
; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
@ -3892,8 +3868,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
@ -4000,8 +3975,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -4041,8 +4015,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 3
; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
@ -4065,8 +4038,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
@ -4932,8 +4904,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -4961,8 +4932,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -4995,8 +4965,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
; HEURRC-NEXT: s_nop 10
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -5029,8 +4998,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: s_nop 10
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -5142,8 +5110,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -5171,8 +5138,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -5205,8 +5171,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
; HEURRC-NEXT: s_nop 10
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -5239,8 +5204,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: s_nop 10
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16

View File

@ -50,8 +50,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
@ -103,8 +102,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -138,8 +136,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
@ -163,8 +160,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
bb:

View File

@ -97,8 +97,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -233,8 +232,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -337,8 +335,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -394,8 +391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -451,8 +447,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@ -514,8 +509,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -582,8 +576,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -634,8 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -669,8 +661,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -696,8 +687,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -872,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -940,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -992,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -1028,8 +1015,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -1056,8 +1042,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
@ -1091,8 +1076,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
@ -1120,8 +1104,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
@ -1145,8 +1128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
@ -1165,8 +1147,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: s_nop 9
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
@ -1183,8 +1164,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-VGPR-NEXT: s_endpgm
bb:
@ -1275,8 +1255,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -1415,8 +1394,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -1523,8 +1501,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
@ -1584,8 +1561,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
@ -1645,8 +1621,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
@ -1714,8 +1689,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -1785,8 +1759,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -1840,8 +1813,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -1878,8 +1850,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: s_nop 9
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -1908,8 +1879,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -2108,8 +2078,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -2179,8 +2148,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -2234,8 +2202,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -2273,8 +2240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: s_nop 9
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -2303,8 +2269,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -2343,8 +2308,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
@ -2375,8 +2339,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
@ -2403,8 +2366,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
@ -2536,8 +2498,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
@ -2658,8 +2619,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
@ -2748,8 +2708,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -2805,8 +2764,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -2862,8 +2820,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@ -2925,8 +2882,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
@ -2993,8 +2949,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
@ -3045,8 +3000,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -3080,8 +3034,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: s_nop 9
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -3107,8 +3060,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -3145,8 +3097,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
@ -3177,8 +3128,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
@ -3211,8 +3161,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: s_nop 8
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -3228,8 +3177,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -3244,8 +3192,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -3645,8 +3592,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -3782,8 +3728,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -3887,8 +3832,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -3945,8 +3889,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@ -4003,8 +3946,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
@ -4068,8 +4010,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -4136,8 +4077,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -4188,8 +4128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -4224,8 +4163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -4252,8 +4190,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -4502,8 +4439,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -4541,8 +4477,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -4578,8 +4513,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: s_nop 8
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -4610,8 +4544,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -4649,8 +4582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -4691,8 +4623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
@ -4730,8 +4661,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@ -4750,8 +4680,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -4768,8 +4697,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: s_nop 9
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -4821,8 +4749,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
@ -4889,8 +4816,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
@ -4948,8 +4874,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@ -4970,8 +4895,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -4990,8 +4914,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
@ -5131,8 +5054,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: s_nop 9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -5186,8 +5108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: s_nop 9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
@ -5242,8 +5163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -5274,8 +5194,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@ -5304,8 +5223,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@ -5357,8 +5275,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
@ -5457,8 +5374,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
@ -5558,8 +5474,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@ -5611,8 +5526,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@ -5679,8 +5593,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
@ -5965,8 +5878,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -6061,8 +5973,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
@ -6125,8 +6036,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@ -6156,8 +6066,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@ -6187,8 +6096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112

View File

@ -23,8 +23,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -47,8 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -71,8 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -95,8 +92,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -119,8 +115,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -143,8 +138,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -167,8 +161,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -191,8 +184,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -216,8 +208,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -241,8 +232,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -266,8 +256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -291,8 +280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -316,8 +304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -341,8 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -366,8 +352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -391,8 +376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -416,8 +400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -441,8 +424,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -466,8 +448,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -491,8 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -517,8 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -542,8 +521,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -566,8 +544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -591,8 +568,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -616,8 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -641,8 +616,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -666,8 +640,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -691,8 +664,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -716,8 +688,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -741,8 +712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -766,8 +736,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -888,8 +857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -913,8 +881,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -938,8 +905,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -963,8 +929,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1180,8 +1145,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1205,8 +1169,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1230,8 +1193,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1255,8 +1217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1429,8 +1390,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
; GCN-NEXT: v_mov_b32_e32 v17, s1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1451,8 +1411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1473,8 +1432,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1512,8 +1470,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1543,8 +1500,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1573,8 +1529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1599,8 +1554,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1629,8 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1655,8 +1608,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1685,8 +1637,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1711,8 +1662,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1733,8 +1683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
; GCN-NEXT: v_mov_b32_e32 v17, s16
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -1763,8 +1712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v9, s24
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1789,8 +1737,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_mov_b32_e32 v9, s24
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1812,8 +1759,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1831,8 +1777,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; GISEL-NEXT: v_mov_b32_e32 v17, -2
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1854,8 +1799,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1873,8 +1817,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_mov_b32_e32 v17, -2
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1896,8 +1839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -1915,8 +1857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -1958,8 +1899,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_mov_b32_e32 v22, s13
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
@ -1983,8 +1923,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
@ -2022,8 +1961,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
@ -2048,8 +1986,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
@ -2087,8 +2024,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
@ -2113,8 +2049,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
@ -2152,8 +2087,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
@ -2178,8 +2112,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
@ -2217,8 +2150,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
@ -2243,8 +2175,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
@ -2263,8 +2194,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2285,8 +2215,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2308,8 +2237,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -2327,8 +2255,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
; GISEL-NEXT: v_mov_b32_e32 v17, 1
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -2350,8 +2277,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@ -2369,8 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@ -2394,8 +2319,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2418,8 +2342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2488,8 +2411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2512,8 +2434,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2536,8 +2457,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@ -2560,8 +2480,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2

View File

@ -133,8 +133,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-NEXT: s_nop 1
; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-SDAG-NEXT: s_nop 7
; GFX942-SDAG-NEXT: s_nop 1
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@ -172,8 +171,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 7
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: s_nop 1
; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 7
; GFX942-SDAG-NEXT: s_nop 1
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@ -239,8 +236,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 7
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32

View File

@ -28,8 +28,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96
; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80
@ -51,8 +50,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304
@ -75,8 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496
@ -99,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688
@ -123,8 +119,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880
@ -159,8 +154,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96
@ -184,8 +178,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304
@ -208,8 +201,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496
@ -233,8 +225,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688
@ -257,8 +248,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880
@ -293,8 +283,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96
@ -315,8 +304,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304
@ -336,8 +324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
@ -358,8 +345,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
@ -383,8 +369,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880
@ -488,8 +473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96
@ -513,8 +497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304
@ -539,8 +522,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480
@ -563,8 +545,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672
@ -587,8 +568,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864
@ -623,8 +603,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96
@ -648,8 +627,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304
@ -673,8 +651,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480
@ -698,8 +675,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672
@ -722,8 +698,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864
@ -758,8 +733,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3]
; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
@ -783,8 +757,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304
@ -808,8 +781,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480
@ -830,8 +802,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576
; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
@ -855,8 +826,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864

View File

@ -678,8 +678,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 4
; GCN-NEXT: s_nop 12
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
@ -785,8 +784,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 4
; EXACTCUTOFF-NEXT: s_nop 12
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
@ -890,8 +888,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
@ -915,8 +912,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
@ -939,8 +935,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496
@ -964,8 +959,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
@ -988,8 +982,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
@ -1024,8 +1017,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 15
; EXACTCUTOFF-NEXT: s_nop 1
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
@ -1049,8 +1041,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 15
; EXACTCUTOFF-NEXT: s_nop 1
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
@ -1073,8 +1064,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 15
; EXACTCUTOFF-NEXT: s_nop 2
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496
@ -1098,8 +1088,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 15
; EXACTCUTOFF-NEXT: s_nop 1
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688
@ -1122,8 +1111,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 15
; EXACTCUTOFF-NEXT: s_nop 2
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880

View File

@ -199,8 +199,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -232,8 +231,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
@ -253,8 +251,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -316,8 +313,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -379,8 +375,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -471,8 +466,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -685,8 +679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NEXT: s_nop 10
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -706,8 +699,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
@ -734,8 +726,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
@ -762,8 +753,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
@ -819,8 +809,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-NEXT: v_mov_b32_e32 v27, v9
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
@ -1049,8 +1038,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -1082,8 +1070,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -1103,8 +1090,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -1166,8 +1152,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -1229,8 +1214,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -1321,8 +1305,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2098,8 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -2131,8 +2113,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -2152,8 +2133,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2215,8 +2195,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2278,8 +2257,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2370,8 +2348,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2471,8 +2448,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -2504,8 +2480,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -2525,8 +2500,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2588,8 +2562,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2651,8 +2624,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2743,8 +2715,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2844,8 +2815,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -2877,8 +2847,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -2898,8 +2867,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -2961,8 +2929,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3024,8 +2991,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3116,8 +3082,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3217,8 +3182,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@ -3250,8 +3214,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
@ -3271,8 +3234,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3334,8 +3296,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3397,8 +3358,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
@ -3489,8 +3449,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14

View File

@ -125,8 +125,7 @@ body: |
...
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: V_MFMA
name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
body: |
@ -136,8 +135,7 @@ body: |
...
# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: V_MFMA
name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
body: |
@ -147,8 +145,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
body: |
@ -196,8 +193,7 @@ body: |
...
# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: V_MFMA
name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
body: |
@ -207,8 +203,7 @@ body: |
...
# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
@ -249,8 +244,7 @@ body: |
...
# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@ -260,8 +254,7 @@ body: |
...
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
@ -312,8 +305,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
body: |
@ -333,8 +325,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
body: |
@ -384,8 +375,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
body: |
@ -435,8 +425,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: FLAT_STORE_DWORD
name: smfma16x16_write_vgpr_flat_read
body: |
@ -446,8 +435,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: FLAT_STORE_DWORD
name: smfma32x32_write_vgpr_flat_read
@ -458,8 +446,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_vgpr_flat_read_overlap
body: |
@ -469,8 +456,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_vgpr_flat_read_full
body: |
@ -480,8 +466,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_vgpr_flat_read
@ -502,8 +487,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32
name: smfma16x16_write_vgpr_valu_read
body: |
@ -513,8 +497,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_MOV_B32
name: smfma32x32_write_vgpr_valu_read
@ -535,8 +518,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_read
body: |
@ -556,8 +538,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_accv_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: smfma16x16_write_vgpr_accv_read
body: |
@ -567,8 +548,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_accv_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: smfma32x32_write_vgpr_accv_read
@ -599,8 +579,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_DOT
name: dmfma16x16_write_vgpr_dot_read
body: |
@ -620,8 +599,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32
name: smfma16x16_write_vgpr_valu_write
body: |
@ -631,8 +609,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_MOV_B32
name: smfma32x32_write_vgpr_valu_write
@ -653,8 +630,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_FMA_F16_e64
name: smfma16x16_write_vgpr_valu_f16_write
body: |
@ -664,8 +640,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_FMA_F16_e64
name: smfma32x32_write_vgpr_valu_f16_write
@ -686,8 +661,7 @@ body: |
...
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32_sdwa
name: smfma16x16_write_vgpr_valu_sdwa_write
body: |
@ -697,8 +671,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_MOV_B32_sdwa
name: smfma32x32_write_vgpr_valu_sdwa_write
@ -719,8 +692,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_write
body: |
@ -770,8 +742,7 @@ body: |
...
# GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 6
# GCN-NEXT: S_NOP 14
# GCN-NEXT: V_MOV_B32
name: smfma32x32_read_srcc_vgpr_valu_write
body: |
@ -1040,8 +1011,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
body: |
@ -1080,8 +1050,7 @@ body: |
...
# GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: V_MFMA
name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
body: |
@ -1091,8 +1060,7 @@ body: |
...
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
@ -1133,8 +1101,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@ -1154,8 +1121,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
body: |
@ -1185,8 +1151,7 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
body: |
@ -1196,8 +1161,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_agpr_flat_read_overlap
body: |
@ -1207,8 +1171,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_agpr_flat_read_full
body: |
@ -1218,8 +1181,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_agpr_flat_read
@ -1240,8 +1202,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
name: dmfma16x16_write_agpr_valu_read
body: |
@ -1261,8 +1222,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: dmfma16x16_write_agpr_valu_write
body: |

View File

@ -178,11 +178,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
body: |
@ -192,11 +189,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
body: |
@ -225,11 +219,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_SMFMAC
name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
body: |
@ -239,8 +230,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -252,8 +242,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -274,8 +263,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: V_MFMA
name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
body: |
@ -285,8 +273,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_SMFMAC
@ -298,11 +285,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
@ -323,11 +307,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
@ -358,9 +339,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
body: |
@ -370,8 +350,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -383,9 +362,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
body: |
@ -395,9 +373,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
body: |
@ -417,9 +394,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@ -429,8 +405,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
body: |
@ -440,9 +415,8 @@ body: |
...
# GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: smfmac32x32_write_agpr_mfma_srca_read_overlap
body: |
@ -452,9 +426,8 @@ body: |
...
# GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_SMFMAC
name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
body: |
@ -464,8 +437,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
@ -477,8 +449,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
@ -539,11 +510,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
@ -564,11 +532,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
@ -639,11 +604,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
@ -654,11 +616,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_SMFMAC
name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
@ -669,11 +628,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_SMFMAC
@ -746,9 +702,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: FLAT_STORE_DWORD
name: xdl_smfma16x16_write_vgpr_flat_read
body: |
@ -758,9 +713,8 @@ body: |
...
# GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: FLAT_STORE_DWORD
name: smfmac32x32_write_vgpr_flat_read
body: |
@ -770,8 +724,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: FLAT_STORE_DWORD
@ -783,8 +736,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_vgpr_flat_read_overlap
body: |
@ -794,8 +746,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_vgpr_flat_read_full
body: |
@ -805,8 +756,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_vgpr_flat_read
@ -827,9 +777,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MOV_B32
name: xdl_smfma16x16_write_vgpr_valu_read
body: |
@ -839,8 +788,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MOV_B32
@ -862,11 +810,8 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_read
@ -887,9 +832,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: xdl_smfma16x16_write_vgpr_accv_read
body: |
@ -899,8 +843,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
@ -932,11 +875,8 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_DOT
@ -958,9 +898,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MOV_B32
name: xdl_smfma16x16_write_vgpr_valu_write
body: |
@ -970,8 +909,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MOV_B32
@ -993,9 +931,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_FMA_F16_e64
name: xdl_smfma16x16_write_vgpr_valu_f16_write
body: |
@ -1005,8 +942,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_FMA_F16_e64
@ -1028,9 +964,8 @@ body: |
...
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MOV_B32_sdwa
name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
body: |
@ -1040,8 +975,7 @@ body: |
...
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MOV_B32_sdwa
@ -1063,8 +997,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_MOV_B32
name: dmfma16x16_write_vgpr_valu_write
body: |
@ -1379,11 +1312,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
@ -1404,11 +1334,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 0
# GCN-NEXT: V_MFMA
@ -1430,9 +1357,8 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
body: |
@ -1442,8 +1368,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -1485,11 +1410,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
@ -1510,11 +1432,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
@ -1546,11 +1465,8 @@ body: |
...
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_MFMA
name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
@ -1561,8 +1477,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_agpr_flat_read_overlap
body: |
@ -1572,8 +1487,7 @@ body: |
...
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 0
# GCN-NEXT: S_NOP 8
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma4x4_write_agpr_flat_read_full
body: |
@ -1583,8 +1497,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: FLAT_STORE_DWORD
name: dmfma16x16_write_agpr_flat_read
@ -1605,11 +1518,8 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
# GCN: V_MFMA
# GFX942-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 7
# GFX950-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 15
# GFX950-NEXT: S_NOP 2
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
name: dmfma16x16_write_agpr_valu_read
@ -1630,8 +1540,7 @@ body: |
...
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 2
# GCN-NEXT: S_NOP 10
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: dmfma16x16_write_agpr_valu_write
body: |
@ -1840,9 +1749,8 @@ body: |
...
# GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_SMFMAC
name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
body: |
@ -1959,8 +1867,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: BUFFER_STORE_DWORD
name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
body: |
@ -1970,8 +1877,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_MOV_B32
name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
body: |
@ -1981,8 +1887,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_MOV_B32
name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
body: |
@ -1992,8 +1897,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: BUFFER_STORE_DWORD
name: nonxdl_smfma32x32_write_vgpr_vm_read
@ -2004,8 +1908,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32
name: nonxdl_smfma32x32_write_vgpr_valu_read
@ -2016,8 +1919,7 @@ body: |
...
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32
name: nonxdl_smfma32x32_write_vgpr_valu_write
@ -2109,9 +2011,8 @@ body: |
...
# GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write
# GCN: V_SMFMAC
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MOV_B32
name: smfmac32x32_read_vgpr_srcc_valu_write
body: |
@ -2121,8 +2022,7 @@ body: |
...
# GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 6
# GCN-NEXT: S_NOP 14
# GCN-NEXT: V_MOV_B32
name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
body: |
@ -2337,9 +2237,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
body: |
@ -2353,9 +2252,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
body: |
@ -2369,9 +2267,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
body: |
@ -2385,8 +2282,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -2403,8 +2299,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
@ -2420,8 +2315,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
@ -2450,8 +2344,7 @@ body: |
# 8 pass source
# GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
body: |
@ -2464,8 +2357,7 @@ body: |
# 8 pass source
# GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
body: |
@ -2477,9 +2369,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
body: |
@ -2492,9 +2383,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
body: |
@ -2507,9 +2397,8 @@ body: |
# 8 pass source
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GFX942-NEXT: S_NOP 10
# GFX950-NEXT: S_NOP 11
# GCN-NEXT: V_MFMA
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
body: |
@ -2522,8 +2411,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_MFMA
@ -2539,8 +2427,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
@ -2557,8 +2444,7 @@ body: |
# 16 pass source
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 2
# GFX950-NEXT: S_NOP 3
# GCN-NEXT: V_MFMA
@ -2603,9 +2489,8 @@ body: |
...
# GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GFX942-NEXT: S_NOP 8
# GFX950-NEXT: S_NOP 9
# GCN-NEXT: V_SMFMAC_
name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
body: |
@ -2617,8 +2502,7 @@ body: |
...
# GCN-LABEL: name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc
# GCN: V_MFMA
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GFX942-NEXT: S_NOP 0
# GFX950-NEXT: S_NOP 1
# GCN-NEXT: V_SMFMAC_

View File

@ -15,8 +15,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
@ -37,8 +36,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
@ -59,8 +57,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
@ -81,8 +78,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
@ -163,8 +159,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 15
; GCN-NEXT: S_NOP 1
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
@ -186,8 +181,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
@ -208,8 +202,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 15
; GCN-NEXT: S_NOP 1
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
@ -231,8 +224,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 1
; GCN-NEXT: S_NOP 9
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
@ -253,8 +245,7 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 7
; GCN-NEXT: S_NOP 3
; GCN-NEXT: S_NOP 11
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec

View File

@ -157,8 +157,7 @@ body: |
# GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read
# GCN: V_MFMA_F32_16X16X1F32
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 1
# GCN-NEXT: S_NOP 9
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
name: mfma_16x16_write_agpr_accvgpr_read
body: |
@ -170,8 +169,7 @@ body: |
# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read
# GCN: V_MFMA_F32_32X32X2F32
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 15
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
name: mfma_32x32_write_agpr_accvgpr_read
@ -208,8 +206,7 @@ body: |
# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write
# GCN: V_MFMA_F32_32X32X2F32
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 6
# GCN-NEXT: S_NOP 14
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: mfma_32x32_write_agpr_accvgpr_write
body: |
@ -244,8 +241,7 @@ body: |
# GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write
# GCN: V_MFMA_F32_32X32X2F32
# GCN-NEXT: S_NOP 7
# GCN-NEXT: S_NOP 4
# GCN-NEXT: S_NOP 12
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
name: mfma_32x32_read_srcc_accvgpr_write
body: |

View File

@ -84,8 +84,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -227,8 +226,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -347,8 +345,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -454,8 +451,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -561,8 +557,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -690,8 +685,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@ -835,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
@ -977,8 +970,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
@ -1079,8 +1071,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26

View File

@ -54,8 +54,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB0_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -148,8 +147,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -288,8 +285,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX908-NEXT: s_cbranch_scc1 .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -383,8 +379,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -444,8 +439,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -518,8 +512,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB2_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -612,8 +605,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -672,8 +664,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -783,8 +774,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -909,8 +899,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1001,8 +990,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1075,8 +1063,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -1170,8 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1231,8 +1217,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1344,8 +1329,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX908-NEXT: s_cbranch_scc1 .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -1441,8 +1425,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1504,8 +1487,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1614,8 +1596,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX908-NEXT: s_cbranch_scc1 .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -1712,8 +1693,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1776,8 +1756,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1856,8 +1835,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
; GFX908-NEXT: s_cbranch_scc1 .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -1919,8 +1897,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -1948,8 +1925,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -2019,8 +1995,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
; GFX908-NEXT: s_nop 1
@ -2065,8 +2040,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB8_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -2118,8 +2092,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
@ -2163,8 +2136,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -2182,8 +2154,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
@ -2227,8 +2198,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -2349,8 +2319,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
; GFX908-NEXT: ; %bb.4: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 2
; GFX908-NEXT: s_nop 10
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
@ -2453,8 +2422,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@ -2523,8 +2491,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: s_nop 8
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80

View File

@ -93,8 +93,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32
; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61
@ -158,8 +157,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
@ -263,8 +261,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 15
; GREEDY90A-NEXT: s_nop 2
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33
@ -298,8 +295,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 15
; GREEDY90A-NEXT: s_nop 2
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
@ -356,8 +352,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 15
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32
; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33
@ -391,8 +386,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 15
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
@ -448,8 +442,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 15
; GREEDY90A-GISEL-NEXT: s_nop 2
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33
@ -484,8 +477,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 15
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@ -542,8 +534,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
@ -609,8 +600,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
@ -676,8 +666,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: s_nop 8
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18
; GREEDY908-NEXT: s_nop 0
@ -685,8 +674,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: s_nop 9
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
@ -744,14 +732,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: s_nop 9
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 2
; GREEDY90A-NEXT: s_nop 10
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
@ -786,14 +772,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 0
; GREEDY942-NEXT: s_nop 8
; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: s_nop 9
; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
@ -827,8 +811,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 2
; GREEDY90A-GISEL-NEXT: s_nop 10
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18
@ -846,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GREEDY90A-GISEL-NEXT: s_nop 7
; GREEDY90A-GISEL-NEXT: s_nop 1
; GREEDY90A-GISEL-NEXT: s_nop 9
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@ -882,8 +864,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 2
; FAST90A-NEXT: s_nop 10
; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18
@ -900,8 +881,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 2
; FAST90A-NEXT: s_nop 10
; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16

View File

@ -372,14 +372,12 @@ body: |
;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 7
; gfx908-PAD75-NEXT: S_NOP 3
; gfx908-PAD75-NEXT: S_NOP 11
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 15
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass
@ -393,8 +391,7 @@ body: |
;
; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass
; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx90a-PAD100-NEXT: S_NOP 7
; gfx90a-PAD100-NEXT: S_NOP 7
; gfx90a-PAD100-NEXT: S_NOP 15
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass
@ -408,8 +405,7 @@ body: |
;
; gfx942-PAD100-LABEL: name: mfma_padding_16_pass
; gfx942-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx942-PAD100-NEXT: S_NOP 7
; gfx942-PAD100-NEXT: S_NOP 7
; gfx942-PAD100-NEXT: S_NOP 15
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
@ -459,8 +455,7 @@ body: |
; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 3
; gfx908-PAD100-NEXT: S_NOP 11
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
@ -486,8 +481,7 @@ body: |
; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx90a-PAD100-NEXT: S_NOP 7
; gfx90a-PAD100-NEXT: S_NOP 3
; gfx90a-PAD100-NEXT: S_NOP 11
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
@ -513,8 +507,7 @@ body: |
; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx942-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; gfx942-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx942-PAD100-NEXT: S_NOP 7
; gfx942-PAD100-NEXT: S_NOP 3
; gfx942-PAD100-NEXT: S_NOP 11
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
@ -887,8 +880,7 @@ body: |
; gfx908-PAD75-NEXT: {{ $}}
; gfx908-PAD75-NEXT: bb.2:
; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 7
; gfx908-PAD75-NEXT: S_NOP 1
; gfx908-PAD75-NEXT: S_NOP 9
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
@ -905,8 +897,7 @@ body: |
; gfx908-PAD100-NEXT: {{ $}}
; gfx908-PAD100-NEXT: bb.2:
; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 5
; gfx908-PAD100-NEXT: S_NOP 13
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
@ -956,8 +947,7 @@ body: |
; gfx90a-PAD100-NEXT: {{ $}}
; gfx90a-PAD100-NEXT: bb.2:
; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx90a-PAD100-NEXT: S_NOP 7
; gfx90a-PAD100-NEXT: S_NOP 5
; gfx90a-PAD100-NEXT: S_NOP 13
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
;
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
@ -1007,8 +997,7 @@ body: |
; gfx942-PAD100-NEXT: {{ $}}
; gfx942-PAD100-NEXT: bb.2:
; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx942-PAD100-NEXT: S_NOP 7
; gfx942-PAD100-NEXT: S_NOP 5
; gfx942-PAD100-NEXT: S_NOP 13
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
bb.0:
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec

View File

@ -33,8 +33,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; CHECK-NEXT: .LBB0_3: ; %if
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
@ -98,8 +97,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace
; CHECK-NEXT: .LBB1_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31]
; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29]
; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27]

View File

@ -60,8 +60,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: s_nop 1
; CHECK-NEXT: v_mov_b32_e32 v2, v32
; CHECK-NEXT: v_mov_b32_e32 v3, v33
@ -96,8 +95,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: s_nop 1
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
@ -143,8 +141,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
@ -178,8 +175,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
@ -212,8 +208,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
@ -351,8 +346,7 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: s_nop 1
; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1
; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2
@ -717,8 +711,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 15
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
@ -777,8 +770,7 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_nop 8
; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]

View File

@ -258,8 +258,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31]
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 5
; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
@ -339,8 +338,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31]
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART

View File

@ -958,8 +958,7 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96