AMDGPU: Fix not folding splat immediate into VGPR MFMA src2 (#150628)
This commit is contained in:
parent
359c04a61c
commit
5f3eea7ef2
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
|
||||
switch (OpTy) {
|
||||
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
|
||||
break;
|
||||
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
|
||||
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
|
||||
break;
|
||||
default:
|
||||
|
@@ -1083,58 +1083,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
|
||||
; GFX90A-VGPR: ; %bb.0: ; %bb
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_endpgm
|
||||
bb:
|
||||
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
|
||||
@@ -1184,58 +1162,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
|
||||
; GFX90A-VGPR: ; %bb.0: ; %bb
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_endpgm
|
||||
bb:
|
||||
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
|
||||
@@ -1285,58 +1241,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
|
||||
; GFX90A-VGPR: ; %bb.0: ; %bb
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
|
||||
; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
|
||||
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 7
|
||||
; GFX90A-VGPR-NEXT: s_nop 1
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_nop 0
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX90A-VGPR-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
|
||||
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
|
||||
; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_endpgm
|
||||
bb:
|
||||
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
|
||||
|
@@ -3238,27 +3238,11 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
|
||||
;
|
||||
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
@@ -3463,13 +3447,10 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 3
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
@@ -4483,13 +4464,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v5, v[0:3]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 2
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
@@ -4627,25 +4605,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
|
||||
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v17, v[0:15]
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 0
|
||||
@@ -4797,36 +4760,20 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
|
||||
;
|
||||
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
|
||||
; GFX942-VGPR: ; %bb.0: ; %bb
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15]
|
||||
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
|
||||
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-VGPR-NEXT: s_nop 7
|
||||
; GFX942-VGPR-NEXT: s_nop 1
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1]
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
||||
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
||||
; GFX942-VGPR-NEXT: s_endpgm
|
||||
bb:
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
|
||||
|
Loading…
x
Reference in New Issue
Block a user