llvm-project/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll

1144 lines
51 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; Declaration of the MFMA intrinsic called by the tests below. It takes two
; float operands, a <32 x float> operand and three i32 operands, and returns a
; <32 x float>.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
; Kernel test: load a <32 x float> from %arg, pass it as the third operand of
; @llvm.amdgcn.mfma.f32.32x32x1f32 together with the constants 1.0 and 2.0, and
; store the result back to %arg. Uses attribute group #0, whose definition is
; outside this excerpt.
; The gfx908 expectations below copy the loaded values into a0..a31 with
; v_accvgpr_write_b32, run the MFMA in place on a[0:31], and copy the result
; out with v_accvgpr_read_b32 ahead of each global store.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX908-NEXT: v_mov_b32_e32 v4, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s16
; GFX908-NEXT: v_mov_b32_e32 v1, s17
; GFX908-NEXT: v_mov_b32_e32 v2, s18
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s21
; GFX908-NEXT: v_mov_b32_e32 v1, s22
; GFX908-NEXT: v_mov_b32_e32 v2, s23
; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s24
; GFX908-NEXT: v_mov_b32_e32 v1, s25
; GFX908-NEXT: v_mov_b32_e32 v2, s26
; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s27
; GFX908-NEXT: v_mov_b32_e32 v1, s28
; GFX908-NEXT: v_mov_b32_e32 v2, s29
; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s30
; GFX908-NEXT: v_mov_b32_e32 v1, s31
; GFX908-NEXT: v_mov_b32_e32 v2, s0
; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s1
; GFX908-NEXT: v_mov_b32_e32 v1, s2
; GFX908-NEXT: v_mov_b32_e32 v2, s3
; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: v_mov_b32_e32 v2, s6
; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s7
; GFX908-NEXT: v_mov_b32_e32 v1, s8
; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: v_mov_b32_e32 v3, s19
; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s10
; GFX908-NEXT: v_mov_b32_e32 v1, s11
; GFX908-NEXT: v_mov_b32_e32 v2, s12
; GFX908-NEXT: v_mov_b32_e32 v5, s20
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s13
; GFX908-NEXT: v_mov_b32_e32 v1, s14
; GFX908-NEXT: v_mov_b32_e32 v2, s15
; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Kernel test with the same IR body as test_mfma_f32_32x32x1f32_vgpr (load,
; MFMA with constants 1.0 and 2.0, store back to %arg) but using attribute
; group #2, whose definition is outside this excerpt.
; The gfx908 expectations below appear identical to those of the _vgpr
; variant: values are copied into a0..a31, the MFMA runs in place on a[0:31],
; and the result is read back with v_accvgpr_read_b32 before each store.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX908-NEXT: v_mov_b32_e32 v4, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s16
; GFX908-NEXT: v_mov_b32_e32 v1, s17
; GFX908-NEXT: v_mov_b32_e32 v2, s18
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s21
; GFX908-NEXT: v_mov_b32_e32 v1, s22
; GFX908-NEXT: v_mov_b32_e32 v2, s23
; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s24
; GFX908-NEXT: v_mov_b32_e32 v1, s25
; GFX908-NEXT: v_mov_b32_e32 v2, s26
; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s27
; GFX908-NEXT: v_mov_b32_e32 v1, s28
; GFX908-NEXT: v_mov_b32_e32 v2, s29
; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s30
; GFX908-NEXT: v_mov_b32_e32 v1, s31
; GFX908-NEXT: v_mov_b32_e32 v2, s0
; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s1
; GFX908-NEXT: v_mov_b32_e32 v1, s2
; GFX908-NEXT: v_mov_b32_e32 v2, s3
; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: v_mov_b32_e32 v2, s6
; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s7
; GFX908-NEXT: v_mov_b32_e32 v1, s8
; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: v_mov_b32_e32 v3, s19
; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s10
; GFX908-NEXT: v_mov_b32_e32 v1, s11
; GFX908-NEXT: v_mov_b32_e32 v2, s12
; GFX908-NEXT: v_mov_b32_e32 v5, s20
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
; GFX908-NEXT: v_mov_b32_e32 v0, s13
; GFX908-NEXT: v_mov_b32_e32 v1, s14
; GFX908-NEXT: v_mov_b32_e32 v2, s15
; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Kernel test with no attribute group: an inline asm statement that defines an
; AGPR precedes the usual load / MFMA (constants 1.0 and 2.0) / store sequence
; on %arg.
; The gfx908 expectations below show the asm body ("; def a0") ahead of the
; loads, the input loaded into v[0:31] with global_load_dwordx4, copied into
; a0..a31, the MFMA run in place on a[0:31], and the result read back into
; VGPRs for the global stores.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
bb:
; The asm output is constrained to a0 ("={a0}"); %acc has no other use in this
; function.
; NOTE(review): the function name says "virtual" while the constraint names the
; register a0 explicitly - confirm this is intended.
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Kernel test with no attribute group: an inline asm statement that uses the
; explicit register range a[100:131] precedes the usual load / MFMA (constants
; 1.0 and 2.0) / store sequence on %arg.
; The gfx908 expectations below show the asm body ("; use a[100:131]") ahead of
; the loads, the input loaded into v[0:31] with global_load_dwordx4, copied
; into a0..a31, the MFMA run in place on a[0:31], and the result read back into
; VGPRs for the global stores.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a[100:131]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
bb:
; The asm takes a poison <32 x float> input constrained to a[100:131]; the
; MFMA in the expectations above still operates on a[0:31].
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Kernel test using attribute group #0 (defined outside this excerpt): an
; inline asm statement that defines only a VGPR precedes the usual load / MFMA
; (constants 1.0 and 2.0) / store sequence on %arg.
; The gfx908 expectations below show the asm body ("; def v0") ahead of the
; loads, the input loaded into v[0:31] with global_load_dwordx4, copied into
; a0..a31, the MFMA run in place on a[0:31], and the result read back into
; VGPRs for the global stores.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
bb:
; The asm output is constrained to v0 ("={v0}"), so the asm itself names no
; AGPR; %acc has no other use in this function.
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Kernel test using attribute group #1 (defined outside this excerpt): a call
; to @foo (declared outside this excerpt) precedes the usual load / MFMA
; (constants 1.0 and 2.0) / store sequence on %arg.
; The gfx908 expectations below initialise s[36:39] from the SCRATCH_RSRC
; symbols, load the callee address through foo@gotpcrel32, and call it with
; s_swappc_b64. After the call the input is loaded into v[0:31] using v40
; (set to 0) as the address VGPR, copied into a0..a31, the MFMA runs in place
; on a[0:31], and the result is read back into VGPRs for the global stores.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_call:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX908-NEXT: s_mov_b32 s38, -1
; GFX908-NEXT: s_mov_b32 s39, 0xe00000
; GFX908-NEXT: s_add_u32 s36, s36, s11
; GFX908-NEXT: s_addc_u32 s37, s37, 0
; GFX908-NEXT: s_mov_b32 s12, s8
; GFX908-NEXT: s_add_u32 s8, s4, 44
; GFX908-NEXT: s_mov_b32 s13, s9
; GFX908-NEXT: s_addc_u32 s9, s5, 0
; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX908-NEXT: s_getpc_b64 s[4:5]
; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX908-NEXT: s_mov_b32 s14, s10
; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: v_mov_b32_e32 v40, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32
; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48
; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
bb:
; The call comes first, so the load, MFMA and store all happen after it.
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; We could avoid the scan to find calls, since we see them during lowering, before
; selection. However, in SelectionDAG, lowering and selection are done block by
; block, so that would only work with GlobalISel.
; Kernel whose call to @foo is in a different basic block from the MFMA:
; bb1 loads the <32 x float> accumulator from %arg, runs the MFMA (the expected
; output shows cbsz:1 abid:2 blgp:3, matching the trailing i32 1, 2, 3 operands)
; and stores the result, then branches on %c0 either to bb2, which calls @foo,
; or directly to bb3, which returns.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
; GFX908: ; %bb.0: ; %bb1
; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; GFX908-NEXT: s_mov_b32 s54, -1
; GFX908-NEXT: s_mov_b32 s55, 0xe00000
; GFX908-NEXT: s_add_u32 s52, s52, s11
; GFX908-NEXT: s_mov_b32 s14, s10
; GFX908-NEXT: s_mov_b32 s12, s8
; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c
; GFX908-NEXT: v_mov_b32_e32 v6, 1.0
; GFX908-NEXT: v_mov_b32_e32 v7, 0
; GFX908-NEXT: s_addc_u32 s53, s53, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
; GFX908-NEXT: s_bitcmp0_b32 s8, 0
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v3, s36
; GFX908-NEXT: v_mov_b32_e32 v4, s37
; GFX908-NEXT: v_mov_b32_e32 v5, s40
; GFX908-NEXT: v_accvgpr_write_b32 a0, v3
; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
; GFX908-NEXT: v_mov_b32_e32 v3, s38
; GFX908-NEXT: v_mov_b32_e32 v4, s39
; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
; GFX908-NEXT: v_accvgpr_write_b32 a3, v4
; GFX908-NEXT: v_mov_b32_e32 v3, s41
; GFX908-NEXT: v_mov_b32_e32 v4, s42
; GFX908-NEXT: v_mov_b32_e32 v5, s43
; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
; GFX908-NEXT: v_accvgpr_write_b32 a7, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s44
; GFX908-NEXT: v_mov_b32_e32 v4, s45
; GFX908-NEXT: v_mov_b32_e32 v5, s46
; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
; GFX908-NEXT: v_accvgpr_write_b32 a10, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s47
; GFX908-NEXT: v_mov_b32_e32 v4, s48
; GFX908-NEXT: v_mov_b32_e32 v5, s49
; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
; GFX908-NEXT: v_accvgpr_write_b32 a13, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s50
; GFX908-NEXT: v_mov_b32_e32 v4, s51
; GFX908-NEXT: v_mov_b32_e32 v5, s16
; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
; GFX908-NEXT: v_accvgpr_write_b32 a16, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s17
; GFX908-NEXT: v_mov_b32_e32 v4, s18
; GFX908-NEXT: v_mov_b32_e32 v5, s19
; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
; GFX908-NEXT: v_accvgpr_write_b32 a19, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: v_mov_b32_e32 v4, s21
; GFX908-NEXT: v_mov_b32_e32 v5, s22
; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
; GFX908-NEXT: v_accvgpr_write_b32 a22, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s23
; GFX908-NEXT: v_mov_b32_e32 v4, s24
; GFX908-NEXT: v_mov_b32_e32 v5, s25
; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
; GFX908-NEXT: v_accvgpr_write_b32 a25, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s26
; GFX908-NEXT: v_mov_b32_e32 v4, s27
; GFX908-NEXT: v_mov_b32_e32 v5, s28
; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
; GFX908-NEXT: v_accvgpr_write_b32 a28, v5
; GFX908-NEXT: v_mov_b32_e32 v3, s29
; GFX908-NEXT: v_mov_b32_e32 v4, s30
; GFX908-NEXT: v_mov_b32_e32 v5, s31
; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
; GFX908-NEXT: v_accvgpr_write_b32 a31, v5
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
; GFX908-NEXT: ; %bb.1: ; %bb2
; GFX908-NEXT: s_add_u32 s8, s4, 48
; GFX908-NEXT: s_mov_b32 s13, s9
; GFX908-NEXT: s_addc_u32 s9, s5, 0
; GFX908-NEXT: s_getpc_b64 s[4:5]
; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: .LBB6_2: ; %bb3
; GFX908-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
store <32 x float> %mai.1, ptr addrspace(1) %arg
; The call lives only on the %c0-taken path, outside the block with the MFMA.
br i1 %c0, label %bb2, label %bb3
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Callable (non-kernel) function using attribute group #0, which sets
; "amdgpu-agpr-alloc"="0". It loads a <32 x float> accumulator from %arg, runs
; the MFMA with all three trailing modifiers zero, and stores the result back.
; The expected gfx908 code copies the loaded values into a0-a31 with
; v_accvgpr_write_b32, runs the MFMA on a[0:31], and reads the result back with
; v_accvgpr_read_b32 before the stores.
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; Callable (non-kernel) function with the same IR body as
; test_mfma_f32_32x32x1f32_nonentry_noagpr, but using attribute group #3, which
; has no "amdgpu-agpr-alloc" entry. The expected gfx908 code writes the loaded
; accumulator into a0-a31, runs the MFMA on a[0:31], and reads the result back
; through v_accvgpr_read_b32 before storing it to %arg.
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
; External function with no body in this file; it is the callee of the
; `call void @foo()` instructions in the call tests.
declare void @foo()
; Attribute groups referenced by the test functions via their trailing #N.
; #0 sets "amdgpu-flat-work-group-size", "amdgpu-waves-per-eu" and
; "amdgpu-agpr-alloc"="0".
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" }
; #1 has the same entries as #0 except that "amdgpu-agpr-alloc" is absent.
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
; #2 sets "amdgpu-flat-work-group-size" and "amdgpu-agpr-alloc"="0" and has no
; "amdgpu-waves-per-eu" entry.
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
; #3 currently has exactly the same contents as #1.
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX90A: {{.*}}