Fix handling of an inreg argument that is uniform but ends up in a VGPR because the SGPRs have run out. --------- Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
5899 lines
262 KiB
LLVM
5899 lines
262 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
|
|
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,SDAG-VGPR %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,GISEL-VGPR %s
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.16x16x64.f16
|
|
; --------------------------------------------------------------------
|
|
|
|
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
; Kernel form of the 16x16x64 f16 smfmac test. The <4 x float> accumulator is
; loaded from %arg at the element selected by the workitem id, while %a, %b and
; %idx come in as kernel arguments. The two trailing immediates (1 and 2)
; appear as cbsz:1 abid:2 on the checked instruction in every check block.
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, 0
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s16
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s16
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
; The result is written to the base pointer %arg, not back to %gep.
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; Callable-function form with ordinary (non-inreg) arguments. Both trailing
; immediates are 0 and the checked instruction carries no cbsz/abid modifiers.
; The result is produced in v[12:15] and then copied to v[0:3] before returning.
|
|
; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; Same shape as the plain 16x16x64 test, but with the trailing immediates set
; to 1 and 3; the checked instruction shows them as cbsz:1 abid:3.
|
|
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; Counterpart of the __flags0 test with the immediate values swapped (3 and 1);
; the checked instruction shows them as cbsz:3 abid:1.
|
|
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; Every argument is marked inreg. In all four check blocks the incoming values
; are moved from SGPRs (s0-s3 and s16-s28) into VGPRs ahead of the smfmac, and
; the instruction writes v[0:3] directly, so no copies follow it.
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-VGPR-NEXT: s_nop 1
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.32x32x32.f16
|
|
; --------------------------------------------------------------------
|
|
|
|
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
; Kernel form of the 32x32x32 f16 smfmac test. The <16 x float> accumulator is
; loaded from %arg at the element selected by the workitem id (four dwordx4
; loads in the checks), while %a, %b and %idx come in as kernel arguments. The
; two trailing immediates (1 and 2) appear as cbsz:1 abid:2 in every check block.
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
|
|
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-NEXT: s_nop 10
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s16
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-NEXT: s_nop 10
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
|
|
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-VGPR-NEXT: s_nop 10
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
|
; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s16
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-VGPR-NEXT: s_nop 10
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <16 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
; The result is written to the base pointer %arg, not back to %gep.
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; Callable-function form with ordinary (non-inreg) arguments and both trailing
; immediates set to 0; no cbsz/abid modifiers appear on the checked instruction.
; The check blocks differ in copy placement: two of them write v[12:27] and copy
; the result down to v[0:15] afterwards, while the third copies the operands
; first and has the instruction write v[0:15] directly.
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_f16:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; Same shape as the plain 32x32x32 test, but with the trailing immediates set
; to 1 and 3; every check block shows them as cbsz:1 abid:3 on the instruction.
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; Counterpart of the __flags0 test with the immediate values swapped (3 and 1);
; every check block shows them as cbsz:3 abid:1 on the instruction.
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
; Every operand is marked inreg. Per the expected output below, the values in
; s0-s3 and s16-s29 are copied straight into VGPRs, while the values that
; arrive in v0-v10 are first read back into SGPRs with v_readfirstlane_b32.
; The GlobalISel VGPR-form run additionally saves and restores s36-s39 and
; s48-s51 through lanes of v29. Both immargs are 0, so no cbsz/abid modifier
; is printed on the SMFMAC.
define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v16, v10
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-NEXT: v_mov_b32_e32 v0, s24
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s13
; SDAG-NEXT: v_mov_b32_e32 v7, s12
; SDAG-NEXT: v_mov_b32_e32 v8, s11
; SDAG-NEXT: v_mov_b32_e32 v9, s10
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v18, s16
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
; SDAG-NEXT: v_mov_b32_e32 v22, s20
; SDAG-NEXT: v_mov_b32_e32 v23, s21
; SDAG-NEXT: v_mov_b32_e32 v24, s22
; SDAG-NEXT: v_mov_b32_e32 v25, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: v_mov_b32_e32 v5, s29
; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_mov_b32_e32 v8, s6
; GISEL-NEXT: v_mov_b32_e32 v9, s7
; GISEL-NEXT: v_mov_b32_e32 v10, s8
; GISEL-NEXT: v_mov_b32_e32 v11, s9
; GISEL-NEXT: v_mov_b32_e32 v12, s10
; GISEL-NEXT: v_mov_b32_e32 v13, s11
; GISEL-NEXT: v_mov_b32_e32 v14, s12
; GISEL-NEXT: v_mov_b32_e32 v15, s13
; GISEL-NEXT: v_mov_b32_e32 v28, s14
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG-VGPR: ; %bb.0:
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
; SDAG-VGPR-NEXT: s_nop 1
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; GISEL-VGPR: ; %bb.0:
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x64.bf16
; --------------------------------------------------------------------

; Operand order: <8 x bfloat>, <16 x bfloat>, a <4 x float> operand of the
; result type, an i32 (named %idx in the kernel test), then two immediates
; that the tests in this section expect to be printed as the cbsz and abid
; modifiers respectively.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel form: loads a <4 x float> from %arg indexed by the workitem id, feeds
; it to the SMFMAC together with the kernel arguments %a, %b and %idx using
; immargs 1 and 2 (expected to print as cbsz:1 abid:2), and stores the result
; to the base of %arg.
define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; GCN-NEXT: s_endpgm
;
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
; GCN-VGPR: ; %bb.0: ; %bb
; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-VGPR-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s16
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, 0
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; GCN-VGPR-NEXT: s_nop 7
; GCN-VGPR-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; GCN-VGPR-NEXT: s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
|
|
|
|
; Plain (non-kernel) function with both immargs 0: the expected SMFMAC uses
; the incoming v[0:3], v[4:11], v[12:15] and v16 directly, carries no cbsz/abid
; modifier, and its result is then copied from v[12:15] to v[0:3].
define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
; GCN-VGPR-NEXT: s_nop 7
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}
|
|
|
|
; Same shape as the plain VGPR test, but with immargs 1 and 3; the expected
; SMFMAC carries the modifiers cbsz:1 abid:3.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-VGPR-NEXT: s_nop 7
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}
|
|
|
|
; Same shape as the plain VGPR test, but with immargs 3 and 1; the expected
; SMFMAC carries the modifiers cbsz:3 abid:1.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-VGPR-NEXT: s_nop 7
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}
|
|
|
|
; Every operand is marked inreg. The expected output copies s0-s3 and s16-s28
; into VGPRs with v_mov_b32 before issuing the SMFMAC on VGPR operands. Both
; immargs are 0, so no cbsz/abid modifier is printed.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v14, s0
; GCN-NEXT: v_mov_b32_e32 v15, s1
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v6, s16
; GCN-NEXT: v_mov_b32_e32 v7, s17
; GCN-NEXT: v_mov_b32_e32 v8, s18
; GCN-NEXT: v_mov_b32_e32 v9, s19
; GCN-NEXT: v_mov_b32_e32 v10, s20
; GCN-NEXT: v_mov_b32_e32 v11, s21
; GCN-NEXT: v_mov_b32_e32 v12, s22
; GCN-NEXT: v_mov_b32_e32 v13, s23
; GCN-NEXT: v_mov_b32_e32 v0, s24
; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: v_mov_b32_e32 v2, s26
; GCN-NEXT: v_mov_b32_e32 v3, s27
; GCN-NEXT: v_mov_b32_e32 v4, s28
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, s0
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, s1
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s2
; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s3
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, s16
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, s17
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, s18
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, s19
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, s20
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, s21
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, s22
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, s23
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28
; GCN-VGPR-NEXT: s_nop 1
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x32.bf16
; --------------------------------------------------------------------

; Operand order: <8 x bfloat>, <16 x bfloat>, a <16 x float> operand of the
; result type, an i32 (named %idx in the kernel test), then two immediates
; that the tests in this section expect to be printed as the cbsz and abid
; modifiers respectively.
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel form: loads a <16 x float> from %arg indexed by the workitem id (four
; dwordx4 loads in the expected output), feeds it to the SMFMAC together with
; the kernel arguments %a, %b and %idx using immargs 1 and 2 (expected to
; print as cbsz:1 abid:2), and stores the result to the base of %arg.
define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 10
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GCN-NEXT: s_endpgm
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
; GCN-VGPR: ; %bb.0: ; %bb
; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s16
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
; GCN-VGPR-NEXT: s_nop 0
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GCN-VGPR-NEXT: s_nop 10
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GCN-VGPR-NEXT: s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
|
|
|
|
; Plain (non-kernel) function with both immargs 0: the expected SMFMAC uses
; the incoming v[0:3], v[4:11], v[12:27] and v28 directly, carries no cbsz/abid
; modifier, and its result is then copied from v[12:27] to v[0:15].
define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain VGPR test, but with immargs 1 and 3; the expected
; SMFMAC carries the modifiers cbsz:1 abid:3.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain VGPR test, but with immargs 3 and 1; the expected
; SMFMAC carries the modifiers cbsz:3 abid:1.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}
|
|
|
|
; Every operand is marked inreg. Per the expected output below, the values in
; s0-s3 and s16-s29 are copied straight into VGPRs, while the values that
; arrive in v0-v10 are first read back into SGPRs with v_readfirstlane_b32
; and then moved into the SMFMAC's VGPR operands. Both immargs are 0, so no
; cbsz/abid modifier is printed.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v26, s0
; GCN-NEXT: v_mov_b32_e32 v27, s1
; GCN-NEXT: v_mov_b32_e32 v28, s2
; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v16, v10
; GCN-NEXT: v_readfirstlane_b32 s4, v9
; GCN-NEXT: v_readfirstlane_b32 s5, v8
; GCN-NEXT: v_readfirstlane_b32 s6, v7
; GCN-NEXT: v_readfirstlane_b32 s7, v6
; GCN-NEXT: v_readfirstlane_b32 s8, v5
; GCN-NEXT: v_readfirstlane_b32 s9, v4
; GCN-NEXT: v_readfirstlane_b32 s10, v3
; GCN-NEXT: v_readfirstlane_b32 s11, v2
; GCN-NEXT: v_readfirstlane_b32 s12, v1
; GCN-NEXT: v_readfirstlane_b32 s13, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v16
; GCN-NEXT: v_mov_b32_e32 v0, s24
; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: v_mov_b32_e32 v2, s26
; GCN-NEXT: v_mov_b32_e32 v3, s27
; GCN-NEXT: v_mov_b32_e32 v4, s28
; GCN-NEXT: v_mov_b32_e32 v5, s29
; GCN-NEXT: v_mov_b32_e32 v6, s13
; GCN-NEXT: v_mov_b32_e32 v7, s12
; GCN-NEXT: v_mov_b32_e32 v8, s11
; GCN-NEXT: v_mov_b32_e32 v9, s10
; GCN-NEXT: v_mov_b32_e32 v10, s9
; GCN-NEXT: v_mov_b32_e32 v11, s8
; GCN-NEXT: v_mov_b32_e32 v12, s7
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s5
; GCN-NEXT: v_mov_b32_e32 v15, s4
; GCN-NEXT: v_mov_b32_e32 v18, s16
; GCN-NEXT: v_mov_b32_e32 v19, s17
; GCN-NEXT: v_mov_b32_e32 v20, s18
; GCN-NEXT: v_mov_b32_e32 v21, s19
; GCN-NEXT: v_mov_b32_e32 v22, s20
; GCN-NEXT: v_mov_b32_e32 v23, s21
; GCN-NEXT: v_mov_b32_e32 v24, s22
; GCN-NEXT: v_mov_b32_e32 v25, s23
; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_mov_b32_e32 v26, s0
; GCN-VGPR-NEXT: v_mov_b32_e32 v27, s1
; GCN-VGPR-NEXT: v_mov_b32_e32 v28, s2
; GCN-VGPR-NEXT: v_mov_b32_e32 v29, s3
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, v10
; GCN-VGPR-NEXT: v_readfirstlane_b32 s4, v9
; GCN-VGPR-NEXT: v_readfirstlane_b32 s5, v8
; GCN-VGPR-NEXT: v_readfirstlane_b32 s6, v7
; GCN-VGPR-NEXT: v_readfirstlane_b32 s7, v6
; GCN-VGPR-NEXT: v_readfirstlane_b32 s8, v5
; GCN-VGPR-NEXT: v_readfirstlane_b32 s9, v4
; GCN-VGPR-NEXT: v_readfirstlane_b32 s10, v3
; GCN-VGPR-NEXT: v_readfirstlane_b32 s11, v2
; GCN-VGPR-NEXT: v_readfirstlane_b32 s12, v1
; GCN-VGPR-NEXT: v_readfirstlane_b32 s13, v0
; GCN-VGPR-NEXT: v_readfirstlane_b32 s0, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, s29
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, s13
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, s12
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, s11
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, s10
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, s9
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, s8
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, s7
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, s6
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, s5
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, s4
; GCN-VGPR-NEXT: v_mov_b32_e32 v18, s16
; GCN-VGPR-NEXT: v_mov_b32_e32 v19, s17
; GCN-VGPR-NEXT: v_mov_b32_e32 v20, s18
; GCN-VGPR-NEXT: v_mov_b32_e32 v21, s19
; GCN-VGPR-NEXT: v_mov_b32_e32 v22, s20
; GCN-VGPR-NEXT: v_mov_b32_e32 v23, s21
; GCN-VGPR-NEXT: v_mov_b32_e32 v24, s22
; GCN-VGPR-NEXT: v_mov_b32_e32 v25, s23
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s0
; GCN-VGPR-NEXT: s_nop 1
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.i32.16x16x128.i8
|
|
; --------------------------------------------------------------------
|
|
|
|
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32)
|
|
|
|
; Kernel form of the smfmac.i32.16x16x128.i8 test. %a, %b and %idx arrive as
; kernel arguments, while the <4 x i32> accumulator is loaded from %arg at the
; element selected by the workitem id. The trailing constants 1 and 2 on the
; intrinsic call appear as "cbsz:1 abid:2" in every expected listing below.
; The four check prefixes used here are the ones selected by the llc
; invocations at the top of the file.
define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; Accumulator input: element %id of the <4 x i32> array starting at %arg.
%gep = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x i32>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
|
|
; The result is stored through the base pointer %arg, not through %gep.
store <4 x i32> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel form with plain (non-inreg) arguments and both trailing immediates
; set to 0. The expected smfmac line carries no cbsz/abid modifier, reads its
; operands from v[0:3], v[4:11] and v16, and accumulates in v[12:15]; the
; result is then copied to v[0:3] before the return.
define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x i32> %result
|
|
}
|
|
|
|
; Same shape as test_smfmac_i32_16x16x128_i8, but the trailing immediates are
; 1 and 3. Both expected listings require them to be printed on the smfmac
; instruction as "cbsz:1 abid:3".
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x i32> %result
|
|
}
|
|
|
|
; Counterpart of the __flags0 test with the two immediates swapped: the call
; passes 3 and 1, and both expected listings require "cbsz:3 abid:1" on the
; smfmac instruction.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x i32> %result
|
|
}
|
|
|
|
; Variant in which every argument is marked inreg. All four expected listings
; first move values out of s-registers into v-registers with v_mov and only
; then issue the smfmac, which writes its result directly to v[0:3]. The
; listings differ in how those copies are formed (32-bit moves versus 64-bit
; pair moves), which is why each configuration has its own check prefix here.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-VGPR-NEXT: s_nop 1
|
|
; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x i32> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.i32.32x32x64.i8
|
|
; --------------------------------------------------------------------
|
|
|
|
declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32>, <8 x i32>, <16 x i32>, i32, i32, i32)
|
|
|
|
; Kernel form of the smfmac.i32.32x32x64.i8 test. %a, %b and %idx arrive as
; kernel arguments, while the <16 x i32> accumulator is loaded from %arg at the
; element selected by the workitem id (four dwordx4 loads in the expected
; output). The trailing constants 1 and 2 on the intrinsic call appear as
; "cbsz:1 abid:2" in every expected listing below.
define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-NEXT: s_nop 10
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-NEXT: s_nop 10
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-VGPR-NEXT: s_nop 10
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-VGPR-NEXT: s_nop 10
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; Accumulator input: element %id of the <16 x i32> array starting at %arg.
%gep = getelementptr <16 x i32>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <16 x i32>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %a, <8 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
|
|
; The result is stored through the base pointer %arg, not through %gep.
store <16 x i32> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel form with plain (non-inreg) arguments and both trailing immediates
; set to 0, so the expected smfmac line carries no cbsz/abid modifier.
; Two shapes of output are expected: the SDAG and GCN-VGPR listings run the
; smfmac on v[12:27] and then copy the result down to v[0:15], whereas the
; GISEL listing first copies the inputs (to v[48:51], v[30:37] and v[0:15])
; and runs the smfmac with v[0:15] as destination.
define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_32x32x64_i8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <16 x i32> %result
|
|
}
|
|
|
|
; Same shape as test_smfmac_i32_32x32x64_i8, but the trailing immediates are
; 1 and 3. All three expected listings require them to be printed on the
; smfmac instruction as "cbsz:1 abid:3".
define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <16 x i32> %result
|
|
}
|
|
|
|
; Counterpart of the __flags0 test with the two immediates swapped: the call
; passes 3 and 1, and all three expected listings require "cbsz:3 abid:1" on
; the smfmac instruction.
define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <16 x i32> %result
|
|
}
|
|
|
|
; Every operand is marked inreg: 4 + 8 + 16 + 1 = 29 dwords in total. In the
; expected code below, 18 of them are read from SGPRs (s[0:3] and s[16:29])
; and the remaining 11 arrive in v0-v10, from where they are moved to SGPRs
; with v_readfirstlane_b32 before being copied into the VGPR operands of the
; instruction. Both trailing immediates are 0 and the expected instruction
; carries no cbsz/abid modifier.
define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v26, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v27, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v28, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v29, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, v10
|
|
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
|
|
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
|
|
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
|
|
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
|
|
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
|
|
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
|
|
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
|
|
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
|
|
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
|
|
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
|
|
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s29
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s7
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s6
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s5
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s4
|
|
; SDAG-NEXT: v_mov_b32_e32 v18, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v19, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v20, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v21, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v22, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v23, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v24, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v25, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s0
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
|
|
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
|
|
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
|
|
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
|
|
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
|
|
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
|
|
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, s28
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, s29
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, s4
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, s5
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, s6
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, s7
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, s8
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, s9
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, s10
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, s11
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, s12
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, s13
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s14
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
|
|
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
|
|
; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
|
|
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
|
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
|
|
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <16 x i32> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
|
|
; --------------------------------------------------------------------
|
|
|
|
; Operand order as used by the tests in this section: <4 x i32> (%a / %arg0),
; <8 x i32> (%b / %arg1), a <4 x float> with the same type as the result,
; an i32 (%idx / %arg3), and two immediates that show up as cbsz and abid,
; in that order, in the expected assembly.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant: each work-item loads a <4 x float> from %arg indexed by its
; workitem.id.x, passes it as the third operand of the intrinsic together
; with the kernel arguments %a, %b and %idx, and stores the result to the
; base address %arg. The trailing immediates 1 and 2 appear as
; "cbsz:1 abid:2" on the expected instruction.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel variant with ordinary (non-inreg) arguments and both trailing
; immediates set to 0; the expected instruction carries no cbsz/abid modifier.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Non-kernel variant with ordinary (non-inreg) arguments. The trailing
; immediates of the intrinsic call are 1 and 3, and the expected instruction
; carries "cbsz:1 abid:3".
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Non-kernel variant with ordinary (non-inreg) arguments. The trailing
; immediates of the intrinsic call are 3 and 1, and the expected instruction
; carries "cbsz:3 abid:1".
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Every operand is marked inreg: 4 + 8 + 4 + 1 = 17 dwords in total. In the
; expected code below all of them are read from SGPRs (s[0:3] and s[16:28])
; and copied into VGPRs with v_mov before the instruction; no
; v_readfirstlane_b32 is expected. Both trailing immediates are 0 and the
; expected instruction carries no cbsz/abid modifier.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-VGPR-NEXT: s_nop 1
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
|
|
; --------------------------------------------------------------------
|
|
|
|
; Operand order as used by the tests in this section: <4 x i32> (%a / %arg0),
; <8 x i32> (%b / %arg1), a <4 x float> with the same type as the result,
; an i32 (%idx / %arg3), and two immediates that show up as cbsz and abid,
; in that order, in the expected assembly.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant: each work-item loads a <4 x float> from %arg indexed by its
; workitem.id.x, passes it as the third operand of the intrinsic together
; with the kernel arguments %a, %b and %idx, and stores the result to the
; base address %arg. The trailing immediates 1 and 2 appear as
; "cbsz:1 abid:2" on the expected instruction.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel variant with ordinary (non-inreg) arguments and both trailing
; immediates set to 0; the expected instruction carries no cbsz/abid modifier.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 1 and 3. They are
; expected to appear on the emitted instruction as cbsz:1 abid:3; the rest of
; the expected sequence (no copies in, result copied from v[12:15] to v[0:3])
; is the same for both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 3 and 1 (the reverse
; of the __flags0 test). They are expected to appear on the emitted instruction
; as cbsz:3 abid:1; the surrounding expected sequence is otherwise unchanged and
; shared by both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant in which every argument is marked inreg. The expected code
; copies the inputs out of scalar registers (s0-s3 and s16-s28) into vector
; registers before the instruction, and the result is produced directly in
; v[0:3], so no copy follows the instruction. SelectionDAG is expected to use
; 32-bit moves for every copy, while GlobalISel uses 64-bit v_mov_b64 for most
; of them, hence the separate per-selector assertions.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-VGPR-NEXT: s_nop 1
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
|
|
; --------------------------------------------------------------------
|
|
|
|
; Operands in order: a <4 x i32> and an <8 x i32> source, a <4 x float> third
; operand, an i32, and two immediates. In the tests of this section the two
; immediates appear on the emitted instruction as cbsz and abid respectively.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant. Loads a <4 x float> from %arg indexed by the workitem id,
; feeds it to the intrinsic as the third operand together with the kernel
; arguments %a, %b and %idx and the immediates 1 and 2 (expected on the
; instruction as cbsz:1 abid:2), then stores the result to the base of %arg
; (not to the indexed address). Assertions are kept per selector because the
; expected register assignment and copy widths differ between them.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Callable (non-kernel) variant with ordinary arguments and both immediates set
; to 0. The expected code has no register copies ahead of the instruction and no
; cbsz/abid modifiers on it; the result in v[12:15] is copied to v[0:3] before
; returning. Both instruction selectors share the same assertions here.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 1 and 3. They are
; expected to appear on the emitted instruction as cbsz:1 abid:3; the rest of
; the expected sequence (no copies in, result copied from v[12:15] to v[0:3])
; is the same for both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 3 and 1 (the reverse
; of the __flags0 test). They are expected to appear on the emitted instruction
; as cbsz:3 abid:1; the surrounding expected sequence is otherwise unchanged and
; shared by both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant in which every argument is marked inreg. The expected code
; copies the inputs out of scalar registers (s0-s3 and s16-s28) into vector
; registers before the instruction, and the result is produced directly in
; v[0:3], so no copy follows the instruction. SelectionDAG is expected to use
; 32-bit moves for every copy, while GlobalISel uses 64-bit v_mov_b64 for most
; of them, hence the separate per-selector assertions.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
|
|
; GISEL-VGPR-NEXT: s_nop 1
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
|
|
; --------------------------------------------------------------------
|
|
|
|
; Operands in order: a <4 x i32> and an <8 x i32> source, a <4 x float> third
; operand, an i32, and two immediates. In the tests of this section the two
; immediates appear on the emitted instruction as cbsz and abid respectively.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant. Loads a <4 x float> from %arg indexed by the workitem id,
; feeds it to the intrinsic as the third operand together with the kernel
; arguments %a, %b and %idx and the immediates 1 and 2 (expected on the
; instruction as cbsz:1 abid:2), then stores the result to the base of %arg
; (not to the indexed address). Assertions are kept per selector because the
; expected register assignment and copy widths differ between them.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-NEXT: s_nop 7
|
|
; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-NEXT: s_nop 6
|
|
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: s_nop 7
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
|
|
; GISEL-VGPR-NEXT: s_nop 6
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <4 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Callable (non-kernel) variant with ordinary arguments and both immediates set
; to 0. The expected code has no register copies ahead of the instruction and no
; cbsz/abid modifiers on it; the result in v[12:15] is copied to v[0:3] before
; returning. Both instruction selectors share the same assertions here.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 1 and 3. They are
; expected to appear on the emitted instruction as cbsz:1 abid:3; the rest of
; the expected sequence (no copies in, result copied from v[12:15] to v[0:3])
; is the same for both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Callable variant with the two trailing immediates set to 3 and 1 (the reverse
; of the __flags0 test). They are expected to appear on the emitted instruction
; as cbsz:3 abid:1; the surrounding expected sequence is otherwise unchanged and
; shared by both instruction selectors.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
|
|
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-NEXT: s_nop 7
|
|
; GCN-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 7
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <4 x float> %result
|
|
}
|
|
|
|
; Every operand (%arg0..%arg3) is passed inreg. The expected code below copies
; each incoming SGPR into a VGPR before issuing v_smfmac. Both immediate
; operands are 0, so no cbsz/abid modifier is expected on the instruction.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v14, s0
; SDAG-NEXT: v_mov_b32_e32 v15, s1
; SDAG-NEXT: v_mov_b32_e32 v16, s2
; SDAG-NEXT: v_mov_b32_e32 v17, s3
; SDAG-NEXT: v_mov_b32_e32 v6, s16
; SDAG-NEXT: v_mov_b32_e32 v7, s17
; SDAG-NEXT: v_mov_b32_e32 v8, s18
; SDAG-NEXT: v_mov_b32_e32 v9, s19
; SDAG-NEXT: v_mov_b32_e32 v10, s20
; SDAG-NEXT: v_mov_b32_e32 v11, s21
; SDAG-NEXT: v_mov_b32_e32 v12, s22
; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v0, s24
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG-VGPR: ; %bb.0:
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
; SDAG-VGPR-NEXT: s_nop 1
; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; GISEL-VGPR: ; %bb.0:
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
; GISEL-VGPR-NEXT: s_nop 1
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8
; --------------------------------------------------------------------
|
|
|
|
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel form. %a, %b and %idx come in as kernel arguments; the accumulator is
; loaded from %arg at the workitem's index and the result is stored back to the
; base of %arg. The two trailing immediates (1, 2) are expected to show up as
; cbsz:1 abid:2 on the instruction.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; SDAG: ; %bb.0: ; %bb
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
; SDAG-NEXT: v_mov_b32_e32 v27, s11
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
; SDAG-NEXT: v_mov_b32_e32 v20, s0
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; SDAG-VGPR: ; %bb.0: ; %bb
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
; SDAG-VGPR-NEXT: s_nop 10
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; GISEL-VGPR: ; %bb.0: ; %bb
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GISEL-VGPR-NEXT: s_nop 10
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
|
|
|
|
; Callable-function form with no inreg operands; the expected code takes the
; inputs from v0-v28. Both immediate operands are 0, so no cbsz/abid modifier
; is expected on the instruction.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain VGPR test, but with non-zero immediates (1, 3). The
; expected instruction carries them as cbsz:1 abid:3, i.e. the first immediate
; is printed as cbsz and the second as abid.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}
|
|
|
|
; Companion to the flags0 test with the immediates swapped (3, 1). The
; expected instruction carries them as cbsz:3 abid:1.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}
|
|
|
|
; Every operand is passed inreg, 29 dwords in total (4 + 8 + 16 + 1). In the
; expected output only the first 18 dwords arrive in SGPRs (s0-s3, s16-s29);
; the last 10 accumulator dwords and %arg3 arrive in v0-v10 and are read back
; with v_readfirstlane_b32 before use. Both immediate operands are 0, so no
; cbsz/abid modifier is expected on the instruction.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v16, v10
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-NEXT: v_mov_b32_e32 v0, s24
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s13
; SDAG-NEXT: v_mov_b32_e32 v7, s12
; SDAG-NEXT: v_mov_b32_e32 v8, s11
; SDAG-NEXT: v_mov_b32_e32 v9, s10
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v18, s16
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
; SDAG-NEXT: v_mov_b32_e32 v22, s20
; SDAG-NEXT: v_mov_b32_e32 v23, s21
; SDAG-NEXT: v_mov_b32_e32 v24, s22
; SDAG-NEXT: v_mov_b32_e32 v25, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: v_mov_b32_e32 v5, s29
; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_mov_b32_e32 v8, s6
; GISEL-NEXT: v_mov_b32_e32 v9, s7
; GISEL-NEXT: v_mov_b32_e32 v10, s8
; GISEL-NEXT: v_mov_b32_e32 v11, s9
; GISEL-NEXT: v_mov_b32_e32 v12, s10
; GISEL-NEXT: v_mov_b32_e32 v13, s11
; GISEL-NEXT: v_mov_b32_e32 v14, s12
; GISEL-NEXT: v_mov_b32_e32 v15, s13
; GISEL-NEXT: v_mov_b32_e32 v28, s14
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG-VGPR: ; %bb.0:
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
; SDAG-VGPR-NEXT: s_nop 1
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; GISEL-VGPR: ; %bb.0:
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8
; --------------------------------------------------------------------
|
|
|
|
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant. The accumulator is loaded from %arg at the work-item's
; index, fed to the intrinsic together with the kernel arguments %a, %b and
; %idx (immediates 1 and 2, expected to print as cbsz:1 abid:2), and the
; result is stored to the base of %arg.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; SDAG: ; %bb.0: ; %bb
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
; SDAG-NEXT: v_mov_b32_e32 v27, s11
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
; SDAG-NEXT: v_mov_b32_e32 v20, s0
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 10
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 10
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; SDAG-VGPR: ; %bb.0: ; %bb
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
; SDAG-VGPR-NEXT: s_nop 10
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; GISEL-VGPR: ; %bb.0: ; %bb
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GISEL-VGPR-NEXT: s_nop 10
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
|
|
|
|
; Callable variant with every operand passed as an ordinary argument and both
; immediate operands set to 0; the expected instruction carries no cbsz or
; abid modifier.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain callable variant, but with immediate operands 1 and
; 3; the expected instruction prints them as cbsz:1 abid:3.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain callable variant, but with immediate operands 3 and
; 1; the expected instruction prints them as cbsz:3 abid:1.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}
|
|
|
|
; Callable variant in which every argument is marked inreg and both immediate
; operands are 0. In the expected code the values arriving in s0-s3 and
; s16-s29 are copied into vector registers, while the remaining values are
; read out of v0-v10 with v_readfirstlane_b32 before being used.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v16, v10
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-NEXT: v_mov_b32_e32 v0, s24
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s13
; SDAG-NEXT: v_mov_b32_e32 v7, s12
; SDAG-NEXT: v_mov_b32_e32 v8, s11
; SDAG-NEXT: v_mov_b32_e32 v9, s10
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v18, s16
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
; SDAG-NEXT: v_mov_b32_e32 v22, s20
; SDAG-NEXT: v_mov_b32_e32 v23, s21
; SDAG-NEXT: v_mov_b32_e32 v24, s22
; SDAG-NEXT: v_mov_b32_e32 v25, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: v_mov_b32_e32 v5, s29
; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_mov_b32_e32 v8, s6
; GISEL-NEXT: v_mov_b32_e32 v9, s7
; GISEL-NEXT: v_mov_b32_e32 v10, s8
; GISEL-NEXT: v_mov_b32_e32 v11, s9
; GISEL-NEXT: v_mov_b32_e32 v12, s10
; GISEL-NEXT: v_mov_b32_e32 v13, s11
; GISEL-NEXT: v_mov_b32_e32 v14, s12
; GISEL-NEXT: v_mov_b32_e32 v15, s13
; GISEL-NEXT: v_mov_b32_e32 v28, s14
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG-VGPR: ; %bb.0:
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
; SDAG-VGPR-NEXT: s_nop 1
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; GISEL-VGPR: ; %bb.0:
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8
; --------------------------------------------------------------------

declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-NEXT: s_nop 10
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-NEXT: s_nop 10
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-VGPR-NEXT: s_nop 10
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-VGPR-NEXT: s_nop 10
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <16 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function: %arg0, %arg1, %arg2 and %arg3 are forwarded unchanged to
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 with both trailing immediate operands
; set to 0, and the intrinsic result is returned directly. In every check block
; below the expected v_smfmac_f32_32x32x64_fp8_bf8 carries no cbsz/abid
; modifiers. The check lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
; Same shape as a plain call of llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 with
; ordinary (non-inreg) arguments, but the two trailing immediate operands are
; 1 and 3. Every check block below expects the selected instruction to carry
; the modifiers "cbsz:1 abid:3", i.e. the first immediate shows up as cbsz and
; the second as abid. The check lines are autogenerated; regenerate them with
; the update script instead of editing by hand.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
; Calls llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 with ordinary (non-inreg)
; arguments and the two trailing immediate operands set to 3 and 1. Every check
; block below expects the selected instruction to carry the modifiers
; "cbsz:3 abid:1", i.e. the first immediate shows up as cbsz and the second as
; abid. The check lines are autogenerated; regenerate them with the update
; script instead of editing by hand.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; SDAG-NEXT: s_nop 11
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, v12
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, v13
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, v14
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, v15
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, v17
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, v18
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, v19
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, v20
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, v21
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, v22
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, v23
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, v24
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, v25
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, v26
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, v27
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b32_e32 v48, v0
|
|
; GISEL-NEXT: v_mov_b32_e32 v49, v1
|
|
; GISEL-NEXT: v_mov_b32_e32 v50, v2
|
|
; GISEL-NEXT: v_mov_b32_e32 v51, v3
|
|
; GISEL-NEXT: v_mov_b32_e32 v30, v4
|
|
; GISEL-NEXT: v_mov_b32_e32 v31, v5
|
|
; GISEL-NEXT: v_mov_b32_e32 v32, v6
|
|
; GISEL-NEXT: v_mov_b32_e32 v33, v7
|
|
; GISEL-NEXT: v_mov_b32_e32 v34, v8
|
|
; GISEL-NEXT: v_mov_b32_e32 v35, v9
|
|
; GISEL-NEXT: v_mov_b32_e32 v36, v10
|
|
; GISEL-NEXT: v_mov_b32_e32 v37, v11
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, v12
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, v13
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, v14
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, v15
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, v16
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, v17
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, v18
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, v19
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, v20
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, v21
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, v22
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, v23
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, v24
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, v25
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, v26
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, v27
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
|
|
; GCN-VGPR: ; %bb.0:
|
|
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
|
|
; GCN-VGPR-NEXT: s_nop 11
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
|
|
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
|
|
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
; Variant in which every parameter is marked inreg. The arguments are forwarded
; unchanged to llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 with both trailing
; immediate operands set to 0, and the result is returned directly.
;
; What the check blocks below expect:
;  * values held in s0-s3 and s16-s29 are copied into VGPRs with v_mov before
;    the v_smfmac instruction;
;  * v0-v10 are read with v_readfirstlane_b32 into SGPRs and then copied back
;    into VGPRs for the instruction operands;
;  * the GISEL-VGPR block additionally saves s36-s39 and s48-s51 into lanes of
;    v29 (v_writelane_b32 / v_readlane_b32), with v29 itself spilled to and
;    reloaded from scratch around the body.
; NOTE(review): presumably v0-v10 carry the inreg values that did not fit in
; the available argument SGPRs - confirm against the calling convention.
; The check lines are autogenerated; regenerate them with the update script
; instead of editing by hand.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
|
|
; SDAG: ; %bb.0:
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-NEXT: v_mov_b32_e32 v26, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v27, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v28, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v29, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, v10
|
|
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
|
|
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
|
|
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
|
|
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
|
|
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
|
|
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
|
|
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
|
|
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
|
|
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
|
|
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
|
|
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
|
|
; SDAG-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-NEXT: v_mov_b32_e32 v5, s29
|
|
; SDAG-NEXT: v_mov_b32_e32 v6, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v7, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v8, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v9, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v10, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v11, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v12, s7
|
|
; SDAG-NEXT: v_mov_b32_e32 v13, s6
|
|
; SDAG-NEXT: v_mov_b32_e32 v14, s5
|
|
; SDAG-NEXT: v_mov_b32_e32 v15, s4
|
|
; SDAG-NEXT: v_mov_b32_e32 v18, s16
|
|
; SDAG-NEXT: v_mov_b32_e32 v19, s17
|
|
; SDAG-NEXT: v_mov_b32_e32 v20, s18
|
|
; SDAG-NEXT: v_mov_b32_e32 v21, s19
|
|
; SDAG-NEXT: v_mov_b32_e32 v22, s20
|
|
; SDAG-NEXT: v_mov_b32_e32 v23, s21
|
|
; SDAG-NEXT: v_mov_b32_e32 v24, s22
|
|
; SDAG-NEXT: v_mov_b32_e32 v25, s23
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s0
|
|
; SDAG-NEXT: s_nop 1
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
|
|
; SDAG-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
|
|
; GISEL: ; %bb.0:
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
|
|
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
|
|
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
|
|
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
|
|
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
|
|
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
|
|
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
|
|
; GISEL-NEXT: v_mov_b32_e32 v0, s24
|
|
; GISEL-NEXT: v_mov_b32_e32 v1, s25
|
|
; GISEL-NEXT: v_mov_b32_e32 v2, s26
|
|
; GISEL-NEXT: v_mov_b32_e32 v3, s27
|
|
; GISEL-NEXT: v_mov_b32_e32 v4, s28
|
|
; GISEL-NEXT: v_mov_b32_e32 v5, s29
|
|
; GISEL-NEXT: v_mov_b32_e32 v6, s4
|
|
; GISEL-NEXT: v_mov_b32_e32 v7, s5
|
|
; GISEL-NEXT: v_mov_b32_e32 v8, s6
|
|
; GISEL-NEXT: v_mov_b32_e32 v9, s7
|
|
; GISEL-NEXT: v_mov_b32_e32 v10, s8
|
|
; GISEL-NEXT: v_mov_b32_e32 v11, s9
|
|
; GISEL-NEXT: v_mov_b32_e32 v12, s10
|
|
; GISEL-NEXT: v_mov_b32_e32 v13, s11
|
|
; GISEL-NEXT: v_mov_b32_e32 v14, s12
|
|
; GISEL-NEXT: v_mov_b32_e32 v15, s13
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s14
|
|
; GISEL-NEXT: s_nop 1
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28
|
|
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
|
|
; SDAG-VGPR: ; %bb.0:
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
|
|
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
|
|
; SDAG-VGPR-NEXT: s_nop 1
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
|
|
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
|
|
; GISEL-VGPR: ; %bb.0:
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
|
|
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
|
|
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
|
|
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
|
|
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
|
|
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
|
|
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
|
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
|
|
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
|
|
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
|
|
ret <16 x float> %result
|
|
}
|
|
|
|
; --------------------------------------------------------------------
|
|
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8
|
|
; --------------------------------------------------------------------
|
|
|
|
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
|
|
|
|
; Kernel variant for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8. The body:
;  1. indexes %arg as an array of <16 x float> by the workitem id and loads
;     one element (%in.1);
;  2. calls the intrinsic with %a, %b, %in.1 and %idx and the two trailing
;     immediate operands set to 1 and 2;
;  3. stores the result to the base pointer %arg (not to the per-workitem
;     address %gep).
; Every check block below expects four global_load_dwordx4 before, and four
; global_store_dwordx4 after, a v_smfmac_f32_32x32x64_fp8_fp8 carrying
; "cbsz:1 abid:2". Attribute group #0 is defined outside this part of the file.
; The check lines are autogenerated; regenerate them with the update script
; instead of editing by hand.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
|
|
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
|
|
; SDAG: ; %bb.0: ; %bb
|
|
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-NEXT: s_nop 0
|
|
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-NEXT: s_nop 10
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-NEXT: s_endpgm
|
|
;
|
|
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
|
|
; GISEL: ; %bb.0: ; %bb
|
|
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-NEXT: s_nop 0
|
|
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-NEXT: s_nop 10
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-NEXT: s_endpgm
|
|
;
|
|
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
|
|
; SDAG-VGPR: ; %bb.0: ; %bb
|
|
; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
|
|
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
|
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
|
|
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; SDAG-VGPR-NEXT: s_nop 0
|
|
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; SDAG-VGPR-NEXT: s_nop 10
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
|
|
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
|
|
; SDAG-VGPR-NEXT: s_endpgm
|
|
;
|
|
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
|
|
; GISEL-VGPR: ; %bb.0: ; %bb
|
|
; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
|
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
|
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
|
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
|
|
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
|
|
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
|
|
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
|
|
; GISEL-VGPR-NEXT: s_nop 0
|
|
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
|
|
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
|
|
; GISEL-VGPR-NEXT: s_nop 10
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
|
|
; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
|
|
; GISEL-VGPR-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
|
|
%in.1 = load <16 x float>, ptr addrspace(1) %gep
|
|
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
|
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function that forwards its four arguments straight to
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8 with both trailing immediate operands
; set to 0. None of the arguments are marked inreg, and the expected
; v_smfmac_f32_32x32x64_fp8_fp8 instruction carries no cbsz/abid modifiers.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; Same shape as the plain fp8_fp8 test, but the two trailing immediate operands
; of the intrinsic are 1 and 3. The expected instruction shows them as the
; cbsz:1 and abid:3 modifiers respectively.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}
|
|
|
|
; Companion to the __flags0 test with the immediate values swapped: the two
; trailing intrinsic operands are 3 and 1, which the expected instruction shows
; as the cbsz:3 and abid:1 modifiers respectively.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v12
; SDAG-NEXT: v_mov_b32_e32 v1, v13
; SDAG-NEXT: v_mov_b32_e32 v2, v14
; SDAG-NEXT: v_mov_b32_e32 v3, v15
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: v_mov_b32_e32 v6, v18
; SDAG-NEXT: v_mov_b32_e32 v7, v19
; SDAG-NEXT: v_mov_b32_e32 v8, v20
; SDAG-NEXT: v_mov_b32_e32 v9, v21
; SDAG-NEXT: v_mov_b32_e32 v10, v22
; SDAG-NEXT: v_mov_b32_e32 v11, v23
; SDAG-NEXT: v_mov_b32_e32 v12, v24
; SDAG-NEXT: v_mov_b32_e32 v13, v25
; SDAG-NEXT: v_mov_b32_e32 v14, v26
; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v48, v0
; GISEL-NEXT: v_mov_b32_e32 v49, v1
; GISEL-NEXT: v_mov_b32_e32 v50, v2
; GISEL-NEXT: v_mov_b32_e32 v51, v3
; GISEL-NEXT: v_mov_b32_e32 v30, v4
; GISEL-NEXT: v_mov_b32_e32 v31, v5
; GISEL-NEXT: v_mov_b32_e32 v32, v6
; GISEL-NEXT: v_mov_b32_e32 v33, v7
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
; GISEL-NEXT: v_mov_b32_e32 v3, v15
; GISEL-NEXT: v_mov_b32_e32 v4, v16
; GISEL-NEXT: v_mov_b32_e32 v5, v17
; GISEL-NEXT: v_mov_b32_e32 v6, v18
; GISEL-NEXT: v_mov_b32_e32 v7, v19
; GISEL-NEXT: v_mov_b32_e32 v8, v20
; GISEL-NEXT: v_mov_b32_e32 v9, v21
; GISEL-NEXT: v_mov_b32_e32 v10, v22
; GISEL-NEXT: v_mov_b32_e32 v11, v23
; GISEL-NEXT: v_mov_b32_e32 v12, v24
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GCN-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; GCN-VGPR: ; %bb.0:
; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}
|
|
|
|
; Variant of the plain fp8_fp8 test in which every argument is marked inreg.
; Per the expected output below, %arg0 is read from s[0:3], %arg1 from s[16:23]
; and the first six elements of %arg2 from s[24:29]. The remaining ten elements
; of %arg2 and the index %arg3 are read from v0-v10 with v_readfirstlane_b32
; before being copied into the VGPR operands of the instruction.
; The GlobalISel run without -amdgpu-mfma-vgpr-form=0 additionally saves and
; restores s36-s39 and s48-s51 through lanes of v29, which is itself spilled
; to scratch.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v16, v10
; SDAG-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-NEXT: v_mov_b32_e32 v0, s24
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s13
; SDAG-NEXT: v_mov_b32_e32 v7, s12
; SDAG-NEXT: v_mov_b32_e32 v8, s11
; SDAG-NEXT: v_mov_b32_e32 v9, s10
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v18, s16
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
; SDAG-NEXT: v_mov_b32_e32 v22, s20
; SDAG-NEXT: v_mov_b32_e32 v23, s21
; SDAG-NEXT: v_mov_b32_e32 v24, s22
; SDAG-NEXT: v_mov_b32_e32 v25, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GISEL-NEXT: v_readfirstlane_b32 s6, v2
; GISEL-NEXT: v_readfirstlane_b32 s7, v3
; GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GISEL-NEXT: v_readfirstlane_b32 s9, v5
; GISEL-NEXT: v_readfirstlane_b32 s10, v6
; GISEL-NEXT: v_readfirstlane_b32 s11, v7
; GISEL-NEXT: v_readfirstlane_b32 s12, v8
; GISEL-NEXT: v_readfirstlane_b32 s13, v9
; GISEL-NEXT: v_readfirstlane_b32 s14, v10
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: v_mov_b32_e32 v5, s29
; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_mov_b32_e32 v8, s6
; GISEL-NEXT: v_mov_b32_e32 v9, s7
; GISEL-NEXT: v_mov_b32_e32 v10, s8
; GISEL-NEXT: v_mov_b32_e32 v11, s9
; GISEL-NEXT: v_mov_b32_e32 v12, s10
; GISEL-NEXT: v_mov_b32_e32 v13, s11
; GISEL-NEXT: v_mov_b32_e32 v14, s12
; GISEL-NEXT: v_mov_b32_e32 v15, s13
; GISEL-NEXT: v_mov_b32_e32 v28, s14
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG-VGPR: ; %bb.0:
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s4, v9
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s5, v8
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s6, v7
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s7, v6
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s8, v5
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s9, v4
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s10, v3
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s11, v2
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s12, v1
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s13, v0
; SDAG-VGPR-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s13
; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s12
; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s11
; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s10
; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s7
; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s6
; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s5
; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s4
; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s0
; SDAG-VGPR-NEXT: s_nop 1
; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; GISEL-VGPR: ; %bb.0:
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-VGPR-NEXT: scratch_store_dword off, v29, s32 ; 4-byte Folded Spill
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s36, 0
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s37, 1
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s38, 2
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s39, 3
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s48, 4
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s49, 5
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s50, 6
; GISEL-VGPR-NEXT: s_mov_b32 s36, s24
; GISEL-VGPR-NEXT: s_mov_b32 s37, s25
; GISEL-VGPR-NEXT: v_writelane_b32 v29, s51, 7
; GISEL-VGPR-NEXT: s_mov_b32 s38, s26
; GISEL-VGPR-NEXT: s_mov_b32 s39, s27
; GISEL-VGPR-NEXT: s_mov_b32 s40, s28
; GISEL-VGPR-NEXT: s_mov_b32 s41, s29
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s42, v0
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s43, v1
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s44, v2
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s45, v3
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s46, v4
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s47, v5
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s48, v6
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s49, v7
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s50, v8
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s51, v9
; GISEL-VGPR-NEXT: v_readfirstlane_b32 s0, v10
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s0
; GISEL-VGPR-NEXT: v_readlane_b32 s51, v29, 7
; GISEL-VGPR-NEXT: v_readlane_b32 s50, v29, 6
; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28
; GISEL-VGPR-NEXT: v_readlane_b32 s49, v29, 5
; GISEL-VGPR-NEXT: v_readlane_b32 s48, v29, 4
; GISEL-VGPR-NEXT: v_readlane_b32 s39, v29, 3
; GISEL-VGPR-NEXT: v_readlane_b32 s38, v29, 2
; GISEL-VGPR-NEXT: v_readlane_b32 s37, v29, 1
; GISEL-VGPR-NEXT: v_readlane_b32 s36, v29, 0
; GISEL-VGPR-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-VGPR-NEXT: scratch_load_dword v29, off, s32 ; 4-byte Folded Reload
; GISEL-VGPR-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}
|
|
|
|
; Attribute group #0, referenced by the amdgpu_kernel tests in this file (for
; example @test_smfmac_f32_16x16x64_f16__vgpr). It sets the flat work-group-size
; attribute to "1,256" and the AGPR-allocation attribute to "0,0".
; NOTE(review): presumably "0,0" keeps these kernels from allocating any AGPRs;
; confirm against the AMDGPU backend documentation.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
|