AMDGPU: Disable AGPR allocation in VGPR MFMA tests (#150873)
This commit is contained in:
parent
f9f68af4b8
commit
6fb8e58565
@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
|
||||
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v44, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s21
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s23
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, s21
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v43, s23
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s19
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s15
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
|
||||
@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
|
||||
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v44, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a31, s23
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a27, s19
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a26, s18
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a25, s17
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a24, s16
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a23, s15
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a22, s14
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a21, s13
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a20, s12
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a19, s11
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s21
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s23
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, s21
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v43, s23
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s19
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s15
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v19, s11
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
|
||||
@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
|
||||
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
|
||||
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
|
||||
store <16 x float> %result, ptr addrspace(1) %out
|
||||
@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
|
||||
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a4, s12
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a5, s13
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a6, s14
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a7, s15
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a8, s16
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a9, s17
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a10, s18
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a11, s19
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a12, s20
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
|
||||
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
|
||||
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
|
||||
store <16 x float> %result, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
|
||||
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
|
||||
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s13
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15]
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
|
||||
@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, s29
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v20, s29
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
|
||||
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
|
||||
@ -1964,40 +1962,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; SDAG-NEXT: s_movk_i32 s6, 0x41
|
||||
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
|
||||
; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
|
||||
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
|
||||
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
|
||||
@ -2031,40 +2025,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; SDAG-NEXT: s_movk_i32 s6, 0x41
|
||||
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
|
||||
; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
|
||||
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
|
||||
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
|
||||
@ -2096,34 +2086,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
|
||||
@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
|
||||
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
|
||||
@ -2162,34 +2148,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 3
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
|
||||
@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
|
||||
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
|
||||
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
|
||||
@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
|
||||
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
|
||||
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
|
||||
|
@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
|
||||
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v30, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v31, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v32, s1
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
|
||||
@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
|
||||
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
|
||||
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, s1
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v32, s1
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
|
||||
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
|
||||
@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
|
||||
; SDAG-NEXT: s_movk_i32 s2, 0x41
|
||||
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s15
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v30, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v31, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
|
||||
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
|
||||
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: s_nop 2
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
|
||||
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
|
||||
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
|
||||
@ -5031,77 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s23
|
||||
; SDAG-NEXT: v_mov_b32_e32 v32, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v33, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v34, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v35, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v36, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v37, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v38, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v39, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v40, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v41, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v42, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v43, s23
|
||||
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s25
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s26
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s27
|
||||
; SDAG-NEXT: v_mov_b32_e32 v44, s24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v45, s25
|
||||
; SDAG-NEXT: v_mov_b32_e32 v46, s26
|
||||
; SDAG-NEXT: v_mov_b32_e32 v47, s27
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
|
||||
; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
|
||||
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 6
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
|
||||
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s17
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
|
||||
; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s19
|
||||
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
|
||||
; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
|
||||
; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s11
|
||||
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -5109,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
|
||||
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GISEL-NEXT: s_nop 1
|
||||
; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
|
||||
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: s_nop 7
|
||||
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
|
||||
@ -5177,77 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s23
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s23
|
||||
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s25
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s26
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s27
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s25
|
||||
; SDAG-NEXT: v_mov_b32_e32 v30, s26
|
||||
; SDAG-NEXT: v_mov_b32_e32 v31, s27
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
|
||||
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
|
||||
; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
|
||||
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s20
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s21
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s22
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s23
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
|
||||
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s19
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s17
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
|
||||
; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s17
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s18
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s19
|
||||
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s13
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
|
||||
; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
|
||||
; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s11
|
||||
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
@ -5255,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
|
||||
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
|
||||
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
|
||||
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
|
||||
; GISEL-NEXT: s_nop 0
|
||||
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
|
||||
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: s_nop 3
|
||||
; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
|
||||
@ -6298,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
|
||||
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
|
||||
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
|
||||
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
|
||||
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
|
||||
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
|
||||
|
@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
|
||||
@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
||||
; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s16
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
||||
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
||||
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
|
||||
@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
|
||||
@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
|
||||
@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
|
||||
@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v15, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
|
||||
@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
||||
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: s_nop 2
|
||||
@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
|
||||
ret <16 x float> %result
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
|
||||
|
Loading…
x
Reference in New Issue
Block a user