[AMDGPU] Use larger immediate values in S_NOP (#158990)
The S_NOP instruction has an immediate operand which is one less than the number of cycles to delay for. The maximum value that may be encoded in this field was increased in GFX8 and again in GFX12.
This commit is contained in:
parent
a42aac5f83
commit
eeced0d073
@ -1839,6 +1839,16 @@ public:
|
||||
/// \returns true if the subtarget requires a wait for xcnt before atomic
|
||||
/// flat/global stores & rmw.
|
||||
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
|
||||
|
||||
/// \returns the number of significant bits in the immediate field of the
|
||||
/// S_NOP instruction.
|
||||
unsigned getSNopBits() const {
|
||||
if (getGeneration() >= AMDGPUSubtarget::GFX12)
|
||||
return 7;
|
||||
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
|
||||
return 4;
|
||||
return 3;
|
||||
}
|
||||
};
|
||||
|
||||
class GCNUserSGPRUsageInfo {
|
||||
|
||||
@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MI,
|
||||
unsigned Quantity) const {
|
||||
DebugLoc DL = MBB.findDebugLoc(MI);
|
||||
unsigned MaxSNopCount = 1u << ST.getSNopBits();
|
||||
while (Quantity > 0) {
|
||||
unsigned Arg = std::min(Quantity, 8u);
|
||||
unsigned Arg = std::min(Quantity, MaxSNopCount);
|
||||
Quantity -= Arg;
|
||||
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
|
||||
}
|
||||
|
||||
@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
|
||||
@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: s_nop 9
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: s_nop 9
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
|
||||
@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
|
||||
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
|
||||
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
|
||||
@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
|
||||
; GCN-NOT: v_accvgpr_write
|
||||
; GCN: v_mfma_f32_32x32x1f32
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NOT: v_accvgpr_read
|
||||
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
|
||||
@ -28,8 +27,7 @@ bb:
|
||||
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
|
||||
; GCN-NOT: v_accvgpr_read
|
||||
; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NOT: v_accvgpr_read
|
||||
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
|
||||
@ -80,8 +78,7 @@ bb:
|
||||
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
|
||||
; GCN-COUNT-32: v_accvgpr_write
|
||||
; GCN: v_mfma_f32_32x32x1f32
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NOT: v_accvgpr_read
|
||||
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
|
||||
|
||||
@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: s_nop 9
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
|
||||
@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
|
||||
; GFX90A-NEXT: ; copy
|
||||
; GFX90A-NEXT: ;;#ASMEND
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
; GFX90A-NEXT: ; use a3 v[0:31]
|
||||
@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: s_nop 9
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
|
||||
@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
|
||||
@ -63,52 +63,41 @@ body: |
|
||||
; GCN16-NEXT: successors: %bb.1(0x80000000)
|
||||
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: S_BRANCH %bb.1
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: bb.1:
|
||||
; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: bb.2:
|
||||
; GCN16-NEXT: successors: %bb.3(0x80000000)
|
||||
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: S_NOP 0
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: S_BRANCH %bb.3
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: bb.3:
|
||||
; GCN16-NEXT: liveins: $sgpr10_sgpr11
|
||||
; GCN16-NEXT: {{ $}}
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 7
|
||||
; GCN16-NEXT: S_NOP 15
|
||||
; GCN16-NEXT: SI_RETURN
|
||||
bb.0:
|
||||
liveins: $sgpr6, $sgpr10_sgpr11
|
||||
|
||||
@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: s_nop 9
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: s_nop 9
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
bb:
|
||||
|
||||
@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
|
||||
@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
|
||||
@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 2
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
|
||||
@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
|
||||
@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
|
||||
@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: s_nop 10
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
|
||||
@ -347,8 +341,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 2
|
||||
; GFX90A-VGPR-NEXT: s_nop 10
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
|
||||
@ -375,8 +368,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: s_nop 10
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
|
||||
@ -505,8 +497,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
|
||||
@ -542,8 +533,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: s_nop 10
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
|
||||
@ -570,8 +560,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 2
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
|
||||
@ -599,8 +588,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: s_nop 10
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
|
||||
@ -632,8 +620,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
@ -671,8 +658,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 2
|
||||
; GFX90A-VGPR-NEXT: s_nop 10
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
|
||||
; GFX90A-VGPR-NEXT: s_endpgm
|
||||
;
|
||||
@ -795,8 +781,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
|
||||
@ -823,8 +808,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
|
||||
@ -847,8 +831,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
|
||||
@ -871,8 +854,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
|
||||
@ -896,8 +878,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -914,8 +895,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -932,8 +912,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -950,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -975,8 +953,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -993,8 +970,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1011,8 +987,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1029,8 +1004,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1054,8 +1028,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1072,8 +1045,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1090,8 +1062,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1108,8 +1079,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1133,8 +1103,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1151,8 +1120,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1169,8 +1137,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1187,8 +1154,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1212,8 +1178,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1230,8 +1195,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1248,8 +1212,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1266,8 +1229,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1299,8 +1261,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1325,8 +1286,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1354,8 +1314,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
@ -1383,8 +1342,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
@ -1416,8 +1374,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1442,8 +1399,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1468,8 +1424,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1494,8 +1449,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1527,8 +1481,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1553,8 +1506,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1579,8 +1531,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1605,8 +1556,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
@ -1639,8 +1589,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1666,8 +1615,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1695,8 +1643,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
@ -1724,8 +1671,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
@ -1757,8 +1703,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1784,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
@ -1813,8 +1757,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 15
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
@ -1842,8 +1785,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -178,8 +178,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -223,8 +222,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -394,8 +392,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: s_nop 10
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -428,8 +425,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: s_nop 10
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
|
||||
@ -479,8 +479,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: s_nop 8
|
||||
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
|
||||
@ -598,8 +597,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 0
|
||||
; VGPRRC-NEXT: s_nop 8
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
|
||||
@ -864,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: s_nop 8
|
||||
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
|
||||
@ -983,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 0
|
||||
; VGPRRC-NEXT: s_nop 8
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
|
||||
@ -1169,8 +1165,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1210,8 +1205,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 3
|
||||
; HEURRC-NEXT: s_nop 11
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1234,8 +1228,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
|
||||
@ -1342,8 +1335,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1383,8 +1375,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 3
|
||||
; HEURRC-NEXT: s_nop 11
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1407,8 +1398,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
|
||||
@ -2199,8 +2189,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -2228,8 +2217,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -2257,8 +2245,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 2
|
||||
; HEURRC-NEXT: s_nop 10
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -2286,8 +2273,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 2
|
||||
; VGPRRC-NEXT: s_nop 10
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -2384,8 +2370,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -2413,8 +2398,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -2442,8 +2426,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 2
|
||||
; HEURRC-NEXT: s_nop 10
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -2471,8 +2454,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 2
|
||||
; VGPRRC-NEXT: s_nop 10
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -3083,8 +3065,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: s_nop 8
|
||||
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
|
||||
@ -3205,8 +3186,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
|
||||
@ -3497,8 +3477,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: s_nop 8
|
||||
; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
|
||||
@ -3619,8 +3598,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
|
||||
@ -3827,8 +3805,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -3868,8 +3845,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 3
|
||||
; HEURRC-NEXT: s_nop 11
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -3892,8 +3868,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
|
||||
@ -4000,8 +3975,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -4041,8 +4015,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 3
|
||||
; HEURRC-NEXT: s_nop 11
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -4065,8 +4038,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 3
|
||||
; VGPRRC-NEXT: s_nop 11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
|
||||
@ -4932,8 +4904,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -4961,8 +4932,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -4995,8 +4965,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 2
|
||||
; HEURRC-NEXT: s_nop 10
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -5029,8 +4998,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 2
|
||||
; VGPRRC-NEXT: s_nop 10
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -5142,8 +5110,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -5171,8 +5138,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -5205,8 +5171,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: s_nop 2
|
||||
; HEURRC-NEXT: s_nop 10
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -5239,8 +5204,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: s_nop 2
|
||||
; VGPRRC-NEXT: s_nop 10
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
|
||||
@ -50,8 +50,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
|
||||
@ -103,8 +102,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -138,8 +136,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: s_nop 9
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -163,8 +160,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
bb:
|
||||
|
||||
@ -97,8 +97,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -233,8 +232,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -337,8 +335,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -394,8 +391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -451,8 +447,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
|
||||
@ -514,8 +509,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -582,8 +576,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -634,8 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -669,8 +661,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -696,8 +687,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: s_nop 8
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -872,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -940,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -992,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -1028,8 +1015,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -1056,8 +1042,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
@ -1091,8 +1076,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
|
||||
; NOLIT-SRCC-NEXT: s_nop 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1120,8 +1104,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
|
||||
; LIT-SRCC-NEXT: s_nop 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1145,8 +1128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
@ -1165,8 +1147,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: s_nop 9
|
||||
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
|
||||
; GFX942-NEXT: s_endpgm
|
||||
;
|
||||
@ -1183,8 +1164,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
|
||||
; GFX942-VGPR-NEXT: s_endpgm
|
||||
bb:
|
||||
@ -1275,8 +1255,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -1415,8 +1394,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -1523,8 +1501,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
|
||||
@ -1584,8 +1561,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
|
||||
@ -1645,8 +1621,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
|
||||
@ -1714,8 +1689,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -1785,8 +1759,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -1840,8 +1813,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -1878,8 +1850,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: s_nop 9
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -1908,8 +1879,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -2108,8 +2078,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -2179,8 +2148,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -2234,8 +2202,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -2273,8 +2240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: s_nop 9
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -2303,8 +2269,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -2343,8 +2308,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2375,8 +2339,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2403,8 +2366,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: s_nop 10
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
@ -2536,8 +2498,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
|
||||
@ -2658,8 +2619,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
|
||||
@ -2748,8 +2708,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -2805,8 +2764,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -2862,8 +2820,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
|
||||
@ -2925,8 +2882,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
|
||||
@ -2993,8 +2949,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
|
||||
@ -3045,8 +3000,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -3080,8 +3034,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: s_nop 9
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -3107,8 +3060,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -3145,8 +3097,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
|
||||
@ -3177,8 +3128,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
|
||||
@ -3211,8 +3161,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: s_nop 8
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -3228,8 +3177,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -3244,8 +3192,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -3645,8 +3592,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
|
||||
; NOLIT-SRCC-NEXT: s_nop 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -3782,8 +3728,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
|
||||
; LIT-SRCC-NEXT: s_nop 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -3887,8 +3832,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -3945,8 +3889,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
|
||||
@ -4003,8 +3946,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
|
||||
@ -4068,8 +4010,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -4136,8 +4077,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -4188,8 +4128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -4224,8 +4163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -4252,8 +4190,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: s_nop 8
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -4502,8 +4439,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -4541,8 +4477,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -4578,8 +4513,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: s_nop 8
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -4610,8 +4544,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: s_nop 8
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -4649,8 +4582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -4691,8 +4623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
@ -4730,8 +4661,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
@ -4750,8 +4680,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -4768,8 +4697,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: s_nop 9
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -4821,8 +4749,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
|
||||
@ -4889,8 +4816,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
|
||||
@ -4948,8 +4874,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
@ -4970,8 +4895,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -4990,8 +4914,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
@ -5131,8 +5054,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: s_nop 9
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -5186,8 +5108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: s_nop 9
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -5242,8 +5163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -5274,8 +5194,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15]
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
@ -5304,8 +5223,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: s_nop 8
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
@ -5357,8 +5275,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
|
||||
@ -5457,8 +5374,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
|
||||
@ -5558,8 +5474,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
@ -5611,8 +5526,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
@ -5679,8 +5593,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
|
||||
@ -5965,8 +5878,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
|
||||
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 7
|
||||
; NOLIT-SRCC-NEXT: s_nop 15
|
||||
; NOLIT-SRCC-NEXT: s_nop 1
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -6061,8 +5973,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
|
||||
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 7
|
||||
; LIT-SRCC-NEXT: s_nop 15
|
||||
; LIT-SRCC-NEXT: s_nop 1
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -6125,8 +6036,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
|
||||
; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
@ -6156,8 +6066,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
|
||||
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
@ -6187,8 +6096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
|
||||
; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 15
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
|
||||
@ -23,8 +23,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -47,8 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -71,8 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -95,8 +92,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -119,8 +115,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -143,8 +138,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -167,8 +161,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -191,8 +184,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -216,8 +208,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -241,8 +232,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -266,8 +256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -291,8 +280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -316,8 +304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -341,8 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -366,8 +352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -391,8 +376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -416,8 +400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -441,8 +424,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -466,8 +448,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -491,8 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -517,8 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -542,8 +521,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -566,8 +544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -591,8 +568,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -616,8 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -641,8 +616,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -666,8 +640,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -691,8 +664,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -716,8 +688,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -741,8 +712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -766,8 +736,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -888,8 +857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -913,8 +881,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -938,8 +905,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -963,8 +929,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1180,8 +1145,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1205,8 +1169,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1230,8 +1193,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1255,8 +1217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1429,8 +1390,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s1
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1451,8 +1411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1473,8 +1432,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1512,8 +1470,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1543,8 +1500,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1573,8 +1529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s20
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1599,8 +1554,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
|
||||
; GISEL-NEXT: v_mov_b32_e32 v8, s20
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1629,8 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s20
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1655,8 +1608,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
|
||||
; GISEL-NEXT: v_mov_b32_e32 v8, s20
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1685,8 +1637,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s20
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1711,8 +1662,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
|
||||
; GISEL-NEXT: v_mov_b32_e32 v8, s20
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1733,8 +1683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s16
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1763,8 +1712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s24
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1789,8 +1737,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
|
||||
; GISEL-NEXT: v_mov_b32_e32 v9, s24
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1812,8 +1759,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1831,8 +1777,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
|
||||
; GISEL-NEXT: v_mov_b32_e32 v17, -2
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1854,8 +1799,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1873,8 +1817,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
|
||||
; GISEL-NEXT: v_mov_b32_e32 v17, -2
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1896,8 +1839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1915,8 +1857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
|
||||
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -1958,8 +1899,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s13
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -1983,8 +1923,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
|
||||
@ -2022,8 +1961,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -2048,8 +1986,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
|
||||
@ -2087,8 +2024,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -2113,8 +2049,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
|
||||
@ -2152,8 +2087,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -2178,8 +2112,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
|
||||
@ -2217,8 +2150,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -2243,8 +2175,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
|
||||
@ -2263,8 +2194,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2285,8 +2215,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2308,8 +2237,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2327,8 +2255,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
|
||||
; GISEL-NEXT: v_mov_b32_e32 v17, 1
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2350,8 +2277,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2369,8 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
|
||||
; GISEL-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: s_nop 11
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2394,8 +2319,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2418,8 +2342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2488,8 +2411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2512,8 +2434,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2536,8 +2457,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
@ -2560,8 +2480,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, v17
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -133,8 +133,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-SDAG-NEXT: s_nop 1
|
||||
; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-SDAG-NEXT: s_nop 7
|
||||
; GFX942-SDAG-NEXT: s_nop 1
|
||||
; GFX942-SDAG-NEXT: s_nop 9
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
@ -172,8 +171,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-GISEL-NEXT: s_nop 1
|
||||
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-GISEL-NEXT: s_nop 7
|
||||
; GFX942-GISEL-NEXT: s_nop 1
|
||||
; GFX942-GISEL-NEXT: s_nop 9
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
|
||||
; GFX942-SDAG-NEXT: s_nop 1
|
||||
; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-SDAG-NEXT: s_nop 7
|
||||
; GFX942-SDAG-NEXT: s_nop 1
|
||||
; GFX942-SDAG-NEXT: s_nop 9
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
@ -239,8 +236,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
|
||||
; GFX942-GISEL-NEXT: s_nop 1
|
||||
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-GISEL-NEXT: s_nop 7
|
||||
; GFX942-GISEL-NEXT: s_nop 1
|
||||
; GFX942-GISEL-NEXT: s_nop 9
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
|
||||
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
|
||||
|
||||
@ -28,8 +28,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80
|
||||
@ -51,8 +50,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304
|
||||
@ -75,8 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496
|
||||
@ -99,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688
|
||||
@ -123,8 +119,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880
|
||||
@ -159,8 +154,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96
|
||||
@ -184,8 +178,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304
|
||||
@ -208,8 +201,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 2
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496
|
||||
@ -233,8 +225,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688
|
||||
@ -257,8 +248,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 2
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880
|
||||
@ -293,8 +283,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96
|
||||
@ -315,8 +304,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304
|
||||
@ -336,8 +324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
|
||||
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 2
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400
|
||||
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
|
||||
@ -358,8 +345,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592
|
||||
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
|
||||
@ -383,8 +369,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 2
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864
|
||||
; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880
|
||||
@ -488,8 +473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 1
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96
|
||||
@ -513,8 +497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 1
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304
|
||||
@ -539,8 +522,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 1
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480
|
||||
@ -563,8 +545,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672
|
||||
@ -587,8 +568,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 7
|
||||
; GCN-MINREG-NEXT: s_nop 15
|
||||
; GCN-MINREG-NEXT: s_nop 2
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880
|
||||
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864
|
||||
@ -623,8 +603,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96
|
||||
@ -648,8 +627,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304
|
||||
@ -673,8 +651,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 2
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480
|
||||
@ -698,8 +675,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 1
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672
|
||||
@ -722,8 +698,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 7
|
||||
; GCN-MAXOCC-NEXT: s_nop 15
|
||||
; GCN-MAXOCC-NEXT: s_nop 2
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880
|
||||
; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864
|
||||
@ -758,8 +733,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3]
|
||||
; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
|
||||
@ -783,8 +757,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304
|
||||
@ -808,8 +781,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 2
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480
|
||||
@ -830,8 +802,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 1
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576
|
||||
; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
|
||||
@ -855,8 +826,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 7
|
||||
; GCN-ILP-NEXT: s_nop 15
|
||||
; GCN-ILP-NEXT: s_nop 2
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880
|
||||
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864
|
||||
|
||||
@ -678,8 +678,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 4
|
||||
; GCN-NEXT: s_nop 12
|
||||
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
|
||||
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
|
||||
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
|
||||
@ -785,8 +784,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
|
||||
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
|
||||
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 4
|
||||
; EXACTCUTOFF-NEXT: s_nop 12
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
|
||||
@ -890,8 +888,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
|
||||
@ -915,8 +912,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
|
||||
@ -939,8 +935,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496
|
||||
@ -964,8 +959,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
|
||||
@ -988,8 +982,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 15
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
|
||||
@ -1024,8 +1017,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 15
|
||||
; EXACTCUTOFF-NEXT: s_nop 1
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
|
||||
@ -1049,8 +1041,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 15
|
||||
; EXACTCUTOFF-NEXT: s_nop 1
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
|
||||
@ -1073,8 +1064,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 15
|
||||
; EXACTCUTOFF-NEXT: s_nop 2
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496
|
||||
@ -1098,8 +1088,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 15
|
||||
; EXACTCUTOFF-NEXT: s_nop 1
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688
|
||||
@ -1122,8 +1111,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 7
|
||||
; EXACTCUTOFF-NEXT: s_nop 15
|
||||
; EXACTCUTOFF-NEXT: s_nop 2
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
|
||||
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880
|
||||
|
||||
@ -199,8 +199,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -232,8 +231,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
@ -253,8 +251,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -316,8 +313,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -379,8 +375,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -471,8 +466,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -685,8 +679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: s_nop 10
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -706,8 +699,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -734,8 +726,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -762,8 +753,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -819,8 +809,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
|
||||
; GCN-NEXT: v_mov_b32_e32 v27, v9
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: s_nop 11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -1049,8 +1038,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -1082,8 +1070,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -1103,8 +1090,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -1166,8 +1152,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -1229,8 +1214,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -1321,8 +1305,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2098,8 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -2131,8 +2113,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -2152,8 +2133,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2215,8 +2195,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2278,8 +2257,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2370,8 +2348,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2471,8 +2448,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -2504,8 +2480,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -2525,8 +2500,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2588,8 +2562,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2651,8 +2624,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2743,8 +2715,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2844,8 +2815,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -2877,8 +2847,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -2898,8 +2867,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -2961,8 +2929,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3024,8 +2991,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3116,8 +3082,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3217,8 +3182,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: s_nop 10
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
||||
@ -3250,8 +3214,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: s_nop 10
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
@ -3271,8 +3234,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3334,8 +3296,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3397,8 +3358,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
@ -3489,8 +3449,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v9
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: s_nop 11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
||||
|
||||
@ -125,8 +125,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -136,8 +135,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -147,8 +145,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -196,8 +193,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
body: |
|
||||
@ -207,8 +203,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
@ -249,8 +244,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -260,8 +254,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
|
||||
@ -312,8 +305,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -333,8 +325,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -384,8 +375,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
|
||||
body: |
|
||||
@ -435,8 +425,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: smfma16x16_write_vgpr_flat_read
|
||||
body: |
|
||||
@ -446,8 +435,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: smfma32x32_write_vgpr_flat_read
|
||||
@ -458,8 +446,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_vgpr_flat_read_overlap
|
||||
body: |
|
||||
@ -469,8 +456,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_vgpr_flat_read_full
|
||||
body: |
|
||||
@ -480,8 +466,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma16x16_write_vgpr_flat_read
|
||||
@ -502,8 +487,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfma16x16_write_vgpr_valu_read
|
||||
body: |
|
||||
@ -513,8 +497,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfma32x32_write_vgpr_valu_read
|
||||
@ -535,8 +518,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: dmfma16x16_write_vgpr_valu_read
|
||||
body: |
|
||||
@ -556,8 +538,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_accv_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: smfma16x16_write_vgpr_accv_read
|
||||
body: |
|
||||
@ -567,8 +548,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_accv_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: smfma32x32_write_vgpr_accv_read
|
||||
@ -599,8 +579,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_DOT
|
||||
name: dmfma16x16_write_vgpr_dot_read
|
||||
body: |
|
||||
@ -620,8 +599,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfma16x16_write_vgpr_valu_write
|
||||
body: |
|
||||
@ -631,8 +609,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfma32x32_write_vgpr_valu_write
|
||||
@ -653,8 +630,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_FMA_F16_e64
|
||||
name: smfma16x16_write_vgpr_valu_f16_write
|
||||
body: |
|
||||
@ -664,8 +640,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_FMA_F16_e64
|
||||
name: smfma32x32_write_vgpr_valu_f16_write
|
||||
@ -686,8 +661,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32_sdwa
|
||||
name: smfma16x16_write_vgpr_valu_sdwa_write
|
||||
body: |
|
||||
@ -697,8 +671,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MOV_B32_sdwa
|
||||
name: smfma32x32_write_vgpr_valu_sdwa_write
|
||||
@ -719,8 +692,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: dmfma16x16_write_vgpr_valu_write
|
||||
body: |
|
||||
@ -770,8 +742,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 6
|
||||
# GCN-NEXT: S_NOP 14
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfma32x32_read_srcc_vgpr_valu_write
|
||||
body: |
|
||||
@ -1040,8 +1011,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -1080,8 +1050,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
|
||||
body: |
|
||||
@ -1091,8 +1060,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
|
||||
@ -1133,8 +1101,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -1154,8 +1121,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -1185,8 +1151,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
|
||||
body: |
|
||||
@ -1196,8 +1161,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_agpr_flat_read_overlap
|
||||
body: |
|
||||
@ -1207,8 +1171,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_agpr_flat_read_full
|
||||
body: |
|
||||
@ -1218,8 +1181,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma16x16_write_agpr_flat_read
|
||||
@ -1240,8 +1202,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
|
||||
name: dmfma16x16_write_agpr_valu_read
|
||||
body: |
|
||||
@ -1261,8 +1222,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: dmfma16x16_write_agpr_valu_write
|
||||
body: |
|
||||
|
||||
@ -178,11 +178,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -192,11 +189,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -225,11 +219,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap
|
||||
body: |
|
||||
@ -239,8 +230,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -252,8 +242,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -274,8 +263,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap
|
||||
body: |
|
||||
@ -285,8 +273,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
@ -298,11 +285,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 0
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
|
||||
@ -323,11 +307,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 0
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
|
||||
@ -358,9 +339,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
body: |
|
||||
@ -370,8 +350,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -383,9 +362,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial
|
||||
body: |
|
||||
@ -395,9 +373,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial
|
||||
body: |
|
||||
@ -417,9 +394,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -429,8 +405,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -440,9 +415,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_SMFMAC
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: smfmac32x32_write_agpr_mfma_srca_read_overlap
|
||||
body: |
|
||||
@ -452,9 +426,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
|
||||
# GCN: V_SMFMAC
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap
|
||||
body: |
|
||||
@ -464,8 +437,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -477,8 +449,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
|
||||
@ -539,11 +510,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
|
||||
@ -564,11 +532,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
|
||||
@ -639,11 +604,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
|
||||
@ -654,11 +616,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap
|
||||
@ -669,11 +628,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
@ -746,9 +702,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: xdl_smfma16x16_write_vgpr_flat_read
|
||||
body: |
|
||||
@ -758,9 +713,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read
|
||||
# GCN: V_SMFMAC
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: smfmac32x32_write_vgpr_flat_read
|
||||
body: |
|
||||
@ -770,8 +724,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
@ -783,8 +736,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_vgpr_flat_read_overlap
|
||||
body: |
|
||||
@ -794,8 +746,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_vgpr_flat_read_full
|
||||
body: |
|
||||
@ -805,8 +756,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma16x16_write_vgpr_flat_read
|
||||
@ -827,9 +777,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: xdl_smfma16x16_write_vgpr_valu_read
|
||||
body: |
|
||||
@ -839,8 +788,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
@ -862,11 +810,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: dmfma16x16_write_vgpr_valu_read
|
||||
@ -887,9 +832,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: xdl_smfma16x16_write_vgpr_accv_read
|
||||
body: |
|
||||
@ -899,8 +843,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
@ -932,11 +875,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
|
||||
# GCN-NEXT: V_DOT
|
||||
@ -958,9 +898,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: xdl_smfma16x16_write_vgpr_valu_write
|
||||
body: |
|
||||
@ -970,8 +909,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
@ -993,9 +931,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_FMA_F16_e64
|
||||
name: xdl_smfma16x16_write_vgpr_valu_f16_write
|
||||
body: |
|
||||
@ -1005,8 +942,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_FMA_F16_e64
|
||||
@ -1028,9 +964,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MOV_B32_sdwa
|
||||
name: xdl_smfma16x16_write_vgpr_valu_sdwa_write
|
||||
body: |
|
||||
@ -1040,8 +975,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MOV_B32_sdwa
|
||||
@ -1063,8 +997,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: dmfma16x16_write_vgpr_valu_write
|
||||
body: |
|
||||
@ -1379,11 +1312,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 0
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
|
||||
@ -1404,11 +1334,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 0
|
||||
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -1430,9 +1357,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
|
||||
body: |
|
||||
@ -1442,8 +1368,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -1485,11 +1410,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
|
||||
@ -1510,11 +1432,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -1546,11 +1465,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
|
||||
@ -1561,8 +1477,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_agpr_flat_read_overlap
|
||||
body: |
|
||||
@ -1572,8 +1487,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 0
|
||||
# GCN-NEXT: S_NOP 8
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma4x4_write_agpr_flat_read_full
|
||||
body: |
|
||||
@ -1583,8 +1497,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: FLAT_STORE_DWORD
|
||||
name: dmfma16x16_write_agpr_flat_read
|
||||
@ -1605,11 +1518,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GFX942-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX950-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 15
|
||||
# GFX950-NEXT: S_NOP 2
|
||||
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
|
||||
name: dmfma16x16_write_agpr_valu_read
|
||||
@ -1630,8 +1540,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 2
|
||||
# GCN-NEXT: S_NOP 10
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: dmfma16x16_write_agpr_valu_write
|
||||
body: |
|
||||
@ -1840,9 +1749,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
|
||||
# GCN: V_SMFMAC
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_SMFMAC
|
||||
name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap
|
||||
body: |
|
||||
@ -1959,8 +1867,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read
|
||||
body: |
|
||||
@ -1970,8 +1877,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read
|
||||
body: |
|
||||
@ -1981,8 +1887,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write
|
||||
body: |
|
||||
@ -1992,8 +1897,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: BUFFER_STORE_DWORD
|
||||
name: nonxdl_smfma32x32_write_vgpr_vm_read
|
||||
@ -2004,8 +1908,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: nonxdl_smfma32x32_write_vgpr_valu_read
|
||||
@ -2016,8 +1919,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: nonxdl_smfma32x32_write_vgpr_valu_write
|
||||
@ -2109,9 +2011,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write
|
||||
# GCN: V_SMFMAC
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: smfmac32x32_read_vgpr_srcc_valu_write
|
||||
body: |
|
||||
@ -2121,8 +2022,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 6
|
||||
# GCN-NEXT: S_NOP 14
|
||||
# GCN-NEXT: V_MOV_B32
|
||||
name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write
|
||||
body: |
|
||||
@ -2337,9 +2237,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
|
||||
body: |
|
||||
@ -2353,9 +2252,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
|
||||
body: |
|
||||
@ -2369,9 +2267,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
|
||||
body: |
|
||||
@ -2385,8 +2282,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2403,8 +2299,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2420,8 +2315,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2450,8 +2344,7 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca
|
||||
body: |
|
||||
@ -2464,8 +2357,7 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb
|
||||
body: |
|
||||
@ -2477,9 +2369,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc
|
||||
body: |
|
||||
@ -2492,9 +2383,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca
|
||||
body: |
|
||||
@ -2507,9 +2397,8 @@ body: |
|
||||
# 8 pass source
|
||||
# GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GFX942-NEXT: S_NOP 10
|
||||
# GFX950-NEXT: S_NOP 11
|
||||
# GCN-NEXT: V_MFMA
|
||||
name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb
|
||||
body: |
|
||||
@ -2522,8 +2411,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2539,8 +2427,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2557,8 +2444,7 @@ body: |
|
||||
# 16 pass source
|
||||
# GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 2
|
||||
# GFX950-NEXT: S_NOP 3
|
||||
# GCN-NEXT: V_MFMA
|
||||
@ -2603,9 +2489,8 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GFX942-NEXT: S_NOP 8
|
||||
# GFX950-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_SMFMAC_
|
||||
name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc
|
||||
body: |
|
||||
@ -2617,8 +2502,7 @@ body: |
|
||||
...
|
||||
# GCN-LABEL: name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc
|
||||
# GCN: V_MFMA
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GFX942-NEXT: S_NOP 0
|
||||
# GFX950-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_SMFMAC_
|
||||
|
||||
@ -15,8 +15,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
|
||||
@ -37,8 +36,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec
|
||||
@ -59,8 +57,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec
|
||||
@ -81,8 +78,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec
|
||||
@ -163,8 +159,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 15
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
@ -186,8 +181,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
@ -208,8 +202,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 15
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
@ -231,8 +224,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 1
|
||||
; GCN-NEXT: S_NOP 9
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec
|
||||
@ -253,8 +245,7 @@ body: |
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_NOP 7
|
||||
; GCN-NEXT: S_NOP 3
|
||||
; GCN-NEXT: S_NOP 11
|
||||
; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec
|
||||
|
||||
@ -157,8 +157,7 @@ body: |
|
||||
|
||||
# GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read
|
||||
# GCN: V_MFMA_F32_16X16X1F32
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: S_NOP 9
|
||||
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
|
||||
name: mfma_16x16_write_agpr_accvgpr_read
|
||||
body: |
|
||||
@ -170,8 +169,7 @@ body: |
|
||||
|
||||
# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read
|
||||
# GCN: V_MFMA_F32_32X32X2F32
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 15
|
||||
# GCN-NEXT: S_NOP 1
|
||||
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
|
||||
name: mfma_32x32_write_agpr_accvgpr_read
|
||||
@ -208,8 +206,7 @@ body: |
|
||||
|
||||
# GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write
|
||||
# GCN: V_MFMA_F32_32X32X2F32
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 6
|
||||
# GCN-NEXT: S_NOP 14
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: mfma_32x32_write_agpr_accvgpr_write
|
||||
body: |
|
||||
@ -244,8 +241,7 @@ body: |
|
||||
|
||||
# GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write
|
||||
# GCN: V_MFMA_F32_32X32X2F32
|
||||
# GCN-NEXT: S_NOP 7
|
||||
# GCN-NEXT: S_NOP 4
|
||||
# GCN-NEXT: S_NOP 12
|
||||
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
|
||||
name: mfma_32x32_read_srcc_accvgpr_write
|
||||
body: |
|
||||
|
||||
@ -84,8 +84,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -227,8 +226,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -347,8 +345,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -454,8 +451,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -561,8 +557,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -690,8 +685,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -835,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
|
||||
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
|
||||
@ -977,8 +970,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
|
||||
@ -1079,8 +1071,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
|
||||
|
||||
@ -54,8 +54,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB0_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -148,8 +147,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -288,8 +285,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -383,8 +379,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -444,8 +439,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -518,8 +512,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -612,8 +605,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -672,8 +664,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -783,8 +774,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -909,8 +899,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1001,8 +990,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1075,8 +1063,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB4_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -1170,8 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1231,8 +1217,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1344,8 +1329,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -1441,8 +1425,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1504,8 +1487,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1614,8 +1596,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -1712,8 +1693,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1776,8 +1756,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1856,8 +1835,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -1919,8 +1897,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -1948,8 +1925,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -2019,8 +1995,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: s_mov_b32 s0, 16
|
||||
; GFX908-NEXT: s_nop 0
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 15
|
||||
; GFX908-NEXT: s_nop 1
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
|
||||
; GFX908-NEXT: s_nop 1
|
||||
@ -2065,8 +2040,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -2118,8 +2092,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_mov_b32 s0, 16
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
|
||||
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
|
||||
@ -2163,8 +2136,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: s_nop 12
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -2182,8 +2154,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_mov_b32 s0, 16
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
|
||||
@ -2227,8 +2198,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 3
|
||||
; GFX942-NEXT: s_nop 11
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -2349,8 +2319,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX908-NEXT: ; %bb.4: ; %exit
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 2
|
||||
; GFX908-NEXT: s_nop 10
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
|
||||
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
|
||||
@ -2453,8 +2422,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 9
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
@ -2523,8 +2491,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: s_nop 8
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
|
||||
|
||||
@ -93,8 +93,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY908-NEXT: s_nop 0
|
||||
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31]
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 15
|
||||
; GREEDY908-NEXT: s_nop 1
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61
|
||||
@ -158,8 +157,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5
|
||||
; GREEDY908-NEXT: s_nop 0
|
||||
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 15
|
||||
; GREEDY908-NEXT: s_nop 1
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
|
||||
@ -263,8 +261,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-NEXT: s_nop 1
|
||||
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 15
|
||||
; GREEDY90A-NEXT: s_nop 2
|
||||
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32
|
||||
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33
|
||||
@ -298,8 +295,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61
|
||||
; GREEDY90A-NEXT: s_nop 1
|
||||
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 15
|
||||
; GREEDY90A-NEXT: s_nop 2
|
||||
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
|
||||
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
|
||||
@ -356,8 +352,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31]
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 15
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32
|
||||
; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33
|
||||
@ -391,8 +386,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 15
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
|
||||
; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
|
||||
@ -448,8 +442,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 15
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 2
|
||||
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32
|
||||
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33
|
||||
@ -484,8 +477,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
|
||||
; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 15
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
|
||||
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
|
||||
@ -542,8 +534,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; FAST90A-NEXT: s_nop 1
|
||||
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
|
||||
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 15
|
||||
; FAST90A-NEXT: s_nop 2
|
||||
; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
|
||||
; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
|
||||
@ -609,8 +600,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
|
||||
; FAST90A-NEXT: s_nop 1
|
||||
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 15
|
||||
; FAST90A-NEXT: s_nop 2
|
||||
; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
@ -676,8 +666,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY908-NEXT: s_nop 1
|
||||
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
|
||||
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 0
|
||||
; GREEDY908-NEXT: s_nop 8
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18
|
||||
; GREEDY908-NEXT: s_nop 0
|
||||
@ -685,8 +674,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3
|
||||
; GREEDY908-NEXT: s_nop 0
|
||||
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GREEDY908-NEXT: s_nop 7
|
||||
; GREEDY908-NEXT: s_nop 1
|
||||
; GREEDY908-NEXT: s_nop 9
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
|
||||
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
|
||||
@ -744,14 +732,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-NEXT: s_nop 1
|
||||
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
|
||||
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 1
|
||||
; GREEDY90A-NEXT: s_nop 9
|
||||
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
|
||||
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
|
||||
; GREEDY90A-NEXT: s_nop 1
|
||||
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GREEDY90A-NEXT: s_nop 7
|
||||
; GREEDY90A-NEXT: s_nop 2
|
||||
; GREEDY90A-NEXT: s_nop 10
|
||||
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
|
||||
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
|
||||
; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
|
||||
@ -786,14 +772,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
|
||||
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 0
|
||||
; GREEDY942-NEXT: s_nop 8
|
||||
; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
|
||||
; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
|
||||
; GREEDY942-NEXT: s_nop 7
|
||||
; GREEDY942-NEXT: s_nop 1
|
||||
; GREEDY942-NEXT: s_nop 9
|
||||
; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
|
||||
; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
|
||||
; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
|
||||
@ -827,8 +811,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 2
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 10
|
||||
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16
|
||||
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17
|
||||
; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18
|
||||
@ -846,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
|
||||
; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 7
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 1
|
||||
; GREEDY90A-GISEL-NEXT: s_nop 9
|
||||
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
|
||||
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
|
||||
; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
|
||||
@ -882,8 +864,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; FAST90A-NEXT: s_nop 1
|
||||
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
|
||||
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 2
|
||||
; FAST90A-NEXT: s_nop 10
|
||||
; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
|
||||
; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
|
||||
; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18
|
||||
@ -900,8 +881,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
|
||||
; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
|
||||
; FAST90A-NEXT: s_nop 1
|
||||
; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
|
||||
; FAST90A-NEXT: s_nop 7
|
||||
; FAST90A-NEXT: s_nop 2
|
||||
; FAST90A-NEXT: s_nop 10
|
||||
; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
|
||||
@ -372,14 +372,12 @@ body: |
|
||||
;
|
||||
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass
|
||||
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; gfx908-PAD75-NEXT: S_NOP 7
|
||||
; gfx908-PAD75-NEXT: S_NOP 3
|
||||
; gfx908-PAD75-NEXT: S_NOP 11
|
||||
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass
|
||||
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; gfx908-PAD100-NEXT: S_NOP 7
|
||||
; gfx908-PAD100-NEXT: S_NOP 7
|
||||
; gfx908-PAD100-NEXT: S_NOP 15
|
||||
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass
|
||||
@ -393,8 +391,7 @@ body: |
|
||||
;
|
||||
; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass
|
||||
; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; gfx90a-PAD100-NEXT: S_NOP 7
|
||||
; gfx90a-PAD100-NEXT: S_NOP 7
|
||||
; gfx90a-PAD100-NEXT: S_NOP 15
|
||||
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass
|
||||
@ -408,8 +405,7 @@ body: |
|
||||
;
|
||||
; gfx942-PAD100-LABEL: name: mfma_padding_16_pass
|
||||
; gfx942-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; gfx942-PAD100-NEXT: S_NOP 7
|
||||
; gfx942-PAD100-NEXT: S_NOP 7
|
||||
; gfx942-PAD100-NEXT: S_NOP 15
|
||||
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
@ -459,8 +455,7 @@ body: |
|
||||
; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx908-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx908-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx908-PAD100-NEXT: S_NOP 7
|
||||
; gfx908-PAD100-NEXT: S_NOP 3
|
||||
; gfx908-PAD100-NEXT: S_NOP 11
|
||||
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
|
||||
@ -486,8 +481,7 @@ body: |
|
||||
; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx90a-PAD100-NEXT: S_NOP 7
|
||||
; gfx90a-PAD100-NEXT: S_NOP 3
|
||||
; gfx90a-PAD100-NEXT: S_NOP 11
|
||||
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
|
||||
@ -513,8 +507,7 @@ body: |
|
||||
; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx942-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx942-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx942-PAD100-NEXT: S_NOP 7
|
||||
; gfx942-PAD100-NEXT: S_NOP 3
|
||||
; gfx942-PAD100-NEXT: S_NOP 11
|
||||
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
|
||||
@ -887,8 +880,7 @@ body: |
|
||||
; gfx908-PAD75-NEXT: {{ $}}
|
||||
; gfx908-PAD75-NEXT: bb.2:
|
||||
; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx908-PAD75-NEXT: S_NOP 7
|
||||
; gfx908-PAD75-NEXT: S_NOP 1
|
||||
; gfx908-PAD75-NEXT: S_NOP 9
|
||||
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
|
||||
@ -905,8 +897,7 @@ body: |
|
||||
; gfx908-PAD100-NEXT: {{ $}}
|
||||
; gfx908-PAD100-NEXT: bb.2:
|
||||
; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx908-PAD100-NEXT: S_NOP 7
|
||||
; gfx908-PAD100-NEXT: S_NOP 5
|
||||
; gfx908-PAD100-NEXT: S_NOP 13
|
||||
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
|
||||
@ -956,8 +947,7 @@ body: |
|
||||
; gfx90a-PAD100-NEXT: {{ $}}
|
||||
; gfx90a-PAD100-NEXT: bb.2:
|
||||
; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx90a-PAD100-NEXT: S_NOP 7
|
||||
; gfx90a-PAD100-NEXT: S_NOP 5
|
||||
; gfx90a-PAD100-NEXT: S_NOP 13
|
||||
; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
;
|
||||
; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
|
||||
@ -1007,8 +997,7 @@ body: |
|
||||
; gfx942-PAD100-NEXT: {{ $}}
|
||||
; gfx942-PAD100-NEXT: bb.2:
|
||||
; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
|
||||
; gfx942-PAD100-NEXT: S_NOP 7
|
||||
; gfx942-PAD100-NEXT: S_NOP 5
|
||||
; gfx942-PAD100-NEXT: S_NOP 13
|
||||
; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
bb.0:
|
||||
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
@ -33,8 +33,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
|
||||
; CHECK-NEXT: .LBB0_2:
|
||||
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
|
||||
; CHECK-NEXT: .LBB0_3: ; %if
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
|
||||
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
|
||||
; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
|
||||
@ -98,8 +97,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace
|
||||
; CHECK-NEXT: .LBB1_1: ; %loop
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27]
|
||||
|
||||
@ -60,8 +60,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: s_nop 1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v32
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v33
|
||||
@ -96,8 +95,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
|
||||
; CHECK-NEXT: v_mov_b32_e32 v32, 0
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: s_nop 1
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
@ -143,8 +141,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v32, 0
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
@ -178,8 +175,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v32, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
|
||||
@ -212,8 +208,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v32, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
||||
; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
|
||||
@ -351,8 +346,7 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add
|
||||
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: s_nop 1
|
||||
; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1
|
||||
; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2
|
||||
@ -717,8 +711,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31]
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 15
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
|
||||
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
|
||||
@ -777,8 +770,7 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
|
||||
; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
|
||||
; CHECK-NEXT: s_nop 7
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: s_nop 8
|
||||
; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
|
||||
@ -258,8 +258,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
|
||||
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_nop 7
|
||||
; GFX908-NEXT: s_nop 5
|
||||
; GFX908-NEXT: s_nop 13
|
||||
; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
|
||||
; GFX908-NEXT: ;;#ASMSTART
|
||||
; GFX908-NEXT: ;;#ASMEND
|
||||
@ -339,8 +338,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31]
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 7
|
||||
; GFX90A-NEXT: s_nop 15
|
||||
; GFX90A-NEXT: s_nop 2
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
|
||||
@ -958,8 +958,7 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 7
|
||||
; GFX942-NEXT: s_nop 15
|
||||
; GFX942-NEXT: s_nop 2
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
|
||||
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user