llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll

2868 lines
150 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
; f32 16x16x4 WMMA whose C operand is a constant splat of float 1.0.
; Both RUN configurations expect the constant to appear directly as the `1.0`
; operand of v_wmma_f32_16x16x4_f32, with no instruction materializing it in
; registers first. The two expected outputs differ only in the order of the two
; 128-bit stores that write the <8 x float> result to %out.
define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; f32 16x16x4 WMMA whose constant C operand is NOT a splat: element 2 is 2.0
; and every other element is 1.0.
; The expected output materializes all eight values in v[6:13] and passes that
; register tuple as the C operand instead of an immediate. The default llc run
; builds the tuple with VGPR moves; the -global-isel run builds it in s[0:7]
; and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6
; GFX1250-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v6
; GFX1250-NEXT: v_dual_mov_b32 v12, v6 :: v_dual_mov_b32 v13, v6
; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; f32 16x16x4 WMMA whose C operand is a constant splat of float 3.0
; (bit pattern 0x40400000).
; Unlike the 1.0 splat case, the expected output does not use an immediate C
; operand: the literal 0x40400000 is loaded once, replicated into v[6:13], and
; that register tuple is passed as C.
define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX1250-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX1250-NEXT: v_mov_b32_e32 v13, v6
; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x float> result, where
; the C operand is a constant splat of float 1.0.
; Both RUN configurations expect identical output: the constant is used
; directly as the `1.0` operand of v_wmma_f32_16x16x32_bf16 and the result in
; v[18:25] is written to %out with two 128-bit stores.
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x float> result, where
; the constant C operand is NOT a splat: element 2 is 2.0, all others are 1.0.
; Both RUN configurations expect identical output: the eight values are
; materialized in v[18:25] with VGPR moves and that tuple is passed as the C
; operand (and reused as the destination).
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x float> result, where
; the C operand is a constant splat of float 3.0 (bit pattern 0x40400000).
; Both RUN configurations expect identical output: the literal is loaded into
; v18, replicated through v25, and v[18:25] is passed as the C operand rather
; than an immediate.
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GISEL-NEXT: v_mov_b32_e32 v25, v18
; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x bfloat> result, where
; the C operand is a constant splat of bfloat 1.0.
; Both RUN configurations expect identical output: the constant is used
; directly as the `1.0` operand of v_wmma_bf16_16x16x32_bf16, and the result in
; v[18:21] is written to %out with a single 128-bit store.
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x bfloat> result, where
; the constant C operand is NOT a splat: element 2 is bfloat 0.0 and all other
; elements are bfloat 1.0.
; Both RUN configurations expect identical output: C is materialized in
; v[18:21] as packed 32-bit words. v18, v20 and v21 receive 0x3f803f80 (two
; bfloat 1.0 values), while v19 receives the 32-bit constant `1.0`
; (0x3f800000), i.e. 0x0000 in the low half and 0x3f80 in the high half.
; NOTE(review): this presumes elements 2 and 3 occupy the low/high halves of
; v19 respectively, which is what the expected output implies.
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs and an <8 x bfloat> result, where
; the C operand is a constant splat of bfloat 1.5 (0x3fc0 per element).
; Both RUN configurations expect identical output: the packed literal
; 0x3fc03fc0 is loaded into v18, replicated through v21, and v[18:21] is passed
; as the C operand rather than an immediate.
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT: v_mov_b32_e32 v21, v18
; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs, an <8 x float> C operand and an
; <8 x bfloat> result, where C is a constant splat of float 1.0.
; Both RUN configurations expect identical output: the constant is used
; directly as the `1.0` operand of v_wmma_bf16f32_16x16x32_bf16 and the result
; in v[18:21] is written to %out with a single 128-bit store.
; Note: the %C parameter is declared but never referenced; the call passes a
; constant vector instead.
define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs, an <8 x float> C operand and an
; <8 x bfloat> result, where the constant C is NOT a splat: element 2 is 2.0
; and all other elements are 1.0.
; Both RUN configurations expect identical output: the eight floats are
; materialized in v[18:25] and passed as C, while the four-register result is
; written to a separate tuple, v[26:29], before being stored to %out.
; Note: the %C parameter is declared but never referenced; the call passes a
; constant vector instead.
define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x32 WMMA with <16 x bfloat> A/B inputs, an <8 x float> C operand and an
; <8 x bfloat> result, where C is a constant splat of float 3.0
; (bit pattern 0x40400000).
; Both RUN configurations expect identical output: the literal is loaded into
; v18, replicated through v25, and v[18:25] is passed as C; the result goes to
; a separate tuple, v[26:29].
; Note: the %C parameter is declared but never referenced; the call passes a
; constant vector instead.
; Note: this function's name ends in `non_inlinable`, whereas the other tests
; in this file use the spelling `non_inlineable`. The label checks below match
; the name as written, so a rename would have to update them as well.
define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GISEL-NEXT: v_mov_b32_e32 v25, v18
; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 1.0.
; Both RUN configurations expect the constant to appear directly as the `1.0`
; operand of v_wmma_f32_16x16x64_fp8_fp8. The two expected outputs differ only
; in the order of the two 128-bit stores that write the result to %out.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the constant C operand is NOT a splat: element 2 is 2.0 and all other
; elements are 1.0.
; The expected output materializes the eight values in v[18:25] and passes that
; tuple as C. The default llc run builds it with VGPR moves; the -global-isel
; run builds it in s[0:7] and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 3.0 (bit pattern
; 0x40400000).
; The expected output does not use an immediate C operand: the literal is
; loaded once, replicated into v[18:25] (directly in VGPRs in the default llc
; run, via s[0:7] in the -global-isel run), and that tuple is passed as C.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x bf8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 1.0.
; Both RUN configurations expect the constant to appear directly as the `1.0`
; operand of v_wmma_f32_16x16x64_fp8_bf8. The two expected outputs differ only
; in the order of the two 128-bit stores that write the result to %out.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x bf8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the constant C operand is NOT a splat: element 2 is 2.0 and all other
; elements are 1.0.
; The expected output materializes the eight values in v[18:25] and passes that
; tuple as C. The default llc run builds it with VGPR moves; the -global-isel
; run builds it in s[0:7] and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 fp8 x bf8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 3.0 (bit pattern
; 0x40400000).
; The expected output does not use an immediate C operand: the literal is
; loaded once, replicated into v[18:25] (directly in VGPRs in the default llc
; run, via s[0:7] in the -global-isel run), and that tuple is passed as C.
define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 bf8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 1.0.
; Both RUN configurations expect the constant to appear directly as the `1.0`
; operand of v_wmma_f32_16x16x64_bf8_fp8. The two expected outputs differ only
; in the order of the two 128-bit stores that write the result to %out.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 bf8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the constant C operand is NOT a splat: element 2 is 2.0 and all other
; elements are 1.0.
; The expected output materializes the eight values in v[18:25] and passes that
; tuple as C. The default llc run builds it with VGPR moves; the -global-isel
; run builds it in s[0:7] and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; 16x16x64 bf8 x fp8 WMMA with <8 x i32> A/B inputs and an <8 x float> result,
; where the C operand is a constant splat of float 3.0 (bit pattern
; 0x40400000).
; The expected output does not use an immediate C operand: the literal is
; loaded once, replicated into v[18:25] (directly in VGPRs in the default llc
; run, via s[0:7] in the -global-isel run), and that tuple is passed as C.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8 with a splat <8 x float>
; accumulator of 1.0 and stores the result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, with no register setup for the accumulator.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8 with a constant <8 x float>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the vector in v[18:25]
; (the 2.0 lands in v20) and passes that register range as the last v_wmma
; operand instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8 with a splat <8 x float>
; accumulator of 3.0 and stores the result to %out.
; In both RUN configurations the expected code first materializes 0x40400000
; into v[18:25] and passes that register range as the last v_wmma operand,
; i.e. the constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8 with a splat <8 x half>
; accumulator of 1.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, followed by a single 128-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8 with a constant <8 x half>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the packed vector in
; v[18:21] (0x3c003c00 for the all-1.0 dwords, 0x3c004000 for the dword holding
; elements 2 and 3) and passes that register range as the last v_wmma operand.
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8 with a splat <8 x half>
; accumulator of 3.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected code materializes the packed value
; 0x42004200 into v[18:21] and passes that register range as the last v_wmma
; operand, i.e. the constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8 with a splat <8 x half>
; accumulator of 1.0 and an i16 modifier operand of 2.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand and carries neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8 with an i16 modifier operand of 2
; and a constant <8 x half> accumulator that is 1.0 everywhere except
; element 2, which is 2.0.
; In both RUN configurations the expected code builds the packed vector in
; v[18:21] (0x3c003c00 / 0x3c004000) and passes it as the last v_wmma operand;
; the instruction also carries neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8 with an i16 modifier operand of 2
; and a splat <8 x half> accumulator of 3.0.
; In both RUN configurations the expected code materializes the packed value
; 0x42004200 into v[18:21] and passes that register range as the last v_wmma
; operand; the instruction also carries neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8 with a splat <8 x half>
; accumulator of 1.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, followed by a single 128-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8 with a constant <8 x half>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the packed vector in
; v[18:21] (0x3c003c00 for the all-1.0 dwords, 0x3c004000 for the dword holding
; elements 2 and 3) and passes that register range as the last v_wmma operand.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8 with a splat <8 x half>
; accumulator of 3.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected code materializes the packed value
; 0x42004200 into v[18:21] and passes that register range as the last v_wmma
; operand, i.e. the constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8 with a splat <8 x half>
; accumulator of 1.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, followed by a single 128-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8 with a constant <8 x half>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the packed vector in
; v[18:21] (0x3c003c00 for the all-1.0 dwords, 0x3c004000 for the dword holding
; elements 2 and 3) and passes that register range as the last v_wmma operand.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8 with a splat <8 x half>
; accumulator of 3.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected code materializes the packed value
; 0x42004200 into v[18:21] and passes that register range as the last v_wmma
; operand, i.e. the constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.i32.16x16x64.iu8 with a splat <8 x i32> accumulator
; of 1 (both leading i1 operands are 0) and stores the result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1 as its last operand, with no register setup for the accumulator.
define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_i32_16x16x64_iu8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.i32.16x16x64.iu8 with a constant <8 x i32>
; accumulator that is 1 everywhere except element 2, which is 2.
; In both RUN configurations the expected code builds the vector in v[18:25]
; (the 2 lands in v20) and passes that register range as the last v_wmma
; operand instead of a literal.
define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1
; GISEL-NEXT: s_mov_b32 s2, 2
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.i32.16x16x64.iu8 with a splat <8 x i32> accumulator
; of 128 and stores the result to %out.
; In both RUN configurations the expected code first materializes 0x80 into
; v[18:25] and passes that register range as the last v_wmma operand, i.e. the
; constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x80
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_movk_i32 s0, 0x80
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 false, i1 false)
store <8 x i32> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x32.f16 on <16 x half> inputs with a splat
; <8 x float> accumulator of 1.0 and stores the result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, with no register setup for the accumulator.
define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_f16:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x32.f16 with a constant <8 x float>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the vector in v[18:25]
; (the 2.0 lands in v20) and passes that register range as the last v_wmma
; operand instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f32.16x16x32.f16 with a splat <8 x float> accumulator
; of 3.0 and stores the result to %out.
; In both RUN configurations the expected code first materializes 0x40400000
; into v[18:25] and passes that register range as the last v_wmma operand,
; i.e. the constant is not encoded directly in the instruction.
define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT: v_mov_b32_e32 v25, v18
; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x32.f16 on <16 x half> inputs with a splat
; <8 x half> accumulator of 1.0 and stores the <8 x half> result to %out.
; In both RUN configurations the expected v_wmma instruction takes the literal
; 1.0 as its last operand, followed by a single 128-bit store.
define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x32_f16:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Calls llvm.amdgcn.wmma.f16.16x16x32.f16 with a constant <8 x half>
; accumulator that is 1.0 everywhere except element 2, which is 2.0.
; In both RUN configurations the expected code builds the packed vector in
; v[18:21] (0x3c003c00 for the all-1.0 dwords, 0x3c004000 for the dword holding
; elements 2 and 3) and passes that register range as the last v_wmma operand.
define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x32 WMMA whose C operand is a splat of half 3.0. Unlike the 1.0
; splat case, the expected code does not use a literal operand: the packed
; value 0x42004200 is copied into every dword of a register range that is
; then passed as C.
define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT: v_mov_b32_e32 v21, v18
; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f32 16x16x128 f8f6f4 WMMA whose C operand is a splat of float 1.0. Both run
; lines expect the constant as the literal 1.0 operand of the WMMA
; instruction. The two run lines differ only in the order of the two result
; stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; f32 16x16x128 f8f6f4 WMMA whose C operand is a non-splat constant vector
; (element 1 is float 2.0, all others float 1.0). The expected code
; materializes the eight elements into registers and passes the register
; range as C instead of a literal.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; f32 16x16x128 f8f6f4 WMMA whose C operand is a splat of float 3.0. The
; expected code does not use a literal operand: the bit pattern 0x40400000 is
; replicated into an eight-register range that is then passed as C.
define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled f32 16x16x128 f8f6f4 WMMA with a float 1.0 splat as C and the two
; 32-bit scale values supplied as inreg arguments. The expected instruction
; takes C as the literal 1.0 and the scales directly from s0 and s1, and is
; printed with matrix_a_scale/matrix_b_scale:MATRIX_SCALE_ROW1 plus
; matrix_a_reuse.
; NOTE(review): matrix_a_reuse presumably comes from the trailing `i1 true`
; argument - confirm against the intrinsic definition.
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled f32 16x16x128 f8f6f4 WMMA with a non-splat constant C (element 1 is
; float 2.0, all others float 1.0) and constant scale values 1 and 2. The
; expected code materializes C into a register range, while the scales 1 and
; 2 appear directly as immediate operands of the WMMA instruction.
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled f32 16x16x128 f8f6f4 WMMA with a float 3.0 splat as C and constant
; scale values 100 and 101. Neither C nor the scales appear as literal
; operands in the expected code: C (0x40400000) is replicated into a register
; range and the scales 0x64 / 0x65 are first moved into registers. The two
; run lines differ in which register file holds the scales (SGPRs in the
; first, VGPRs in the second).
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_movk_i32 s0, 0x65
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b32_e32 v42, 0x64
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_mov_b32_e32 v43, 0x65
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant of the scaled f32 16x16x128 f8f6f4 WMMA: the scale values
; are 64-bit inreg arguments. C is a float 1.0 splat. The expected
; instruction takes C as the literal 1.0 and the scales from the SGPR pairs
; s[0:1] and s[2:3].
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant with a non-splat constant C (element 1 is float 2.0, all
; others float 1.0) and constant 64-bit scale values 1 and 2. The expected
; code materializes C into a register range, while the scales 1 and 2 appear
; directly as immediate operands of the WMMA instruction.
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant with a float 3.0 splat as C and constant 64-bit scale
; values 100 and 101. Neither C nor the scales appear as literal operands in
; the expected code: C (0x40400000) is replicated into a register range and
; the scales 0x64 / 0x65 are first moved into 64-bit register pairs. The two
; run lines differ in which register file holds the scales (SGPR pairs in the
; first, VGPR pairs in the second).
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x fp8 WMMA whose C operand is a splat of half 1.0. Both
; run lines expect the constant as the literal 1.0 operand of the WMMA
; instruction, with no register materialization beforehand.
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x fp8 WMMA whose C operand is a non-splat constant vector
; (element 2 is half 2.0, all others half 1.0). The expected code builds the
; vector in registers (0x3c003c00 / 0x3c004000 packed dwords) and passes the
; register range as C instead of a literal.
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x fp8 WMMA whose C operand is a splat of half 3.0. The
; expected code does not use a literal operand: the packed value 0x42004200
; is copied into every dword of a register range that is then passed as C.
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x bf8 WMMA whose C operand is a splat of half 1.0. This
; test passes i16 2 as the modifier argument, and the expected instruction
; carries neg_hi:[0,0,1] alongside the literal 1.0 C operand.
; NOTE(review): the i16 2 -> neg_hi:[0,0,1] mapping is read off the expected
; output here - confirm against the intrinsic definition.
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x bf8 WMMA with a non-splat constant C (element 2 is
; half 2.0, all others half 1.0) and i16 2 as the modifier argument. The
; expected code builds C in registers and the WMMA instruction keeps
; neg_hi:[0,0,1] on the register-operand form.
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 fp8 x bf8 WMMA with a half 3.0 splat as C and i16 2 as the
; modifier argument. The expected code replicates 0x42004200 into a register
; range used as C, and the WMMA instruction keeps neg_hi:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 bf8 x fp8 WMMA whose C operand is a splat of half 1.0. Both
; run lines expect the constant as the literal 1.0 operand of the WMMA
; instruction, with no register materialization beforehand.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 bf8 x fp8 WMMA whose C operand is a non-splat constant vector
; (element 2 is half 2.0, all others half 1.0). The expected code builds the
; vector in registers (0x3c003c00 / 0x3c004000 packed dwords) and passes the
; register range as C instead of a literal.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 bf8 x fp8 WMMA whose C operand is a splat of half 3.0. The
; expected code does not use a literal operand: the packed value 0x42004200
; is copied into every dword of a register range that is then passed as C.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 bf8 x bf8 WMMA whose C operand is a splat of half 1.0. Both
; run lines expect the constant as the literal 1.0 operand of the WMMA
; instruction, with no register materialization beforehand.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; f16 16x16x128 bf8 x bf8 WMMA whose C operand is a non-splat constant vector
; (element 2 is half 2.0, all others half 1.0). The expected code builds the
; vector in registers (0x3c003c00 / 0x3c004000 packed dwords) and passes the
; register range as C instead of a literal.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <8 x half> constant operand. Unlike the all-1.0 case, both runs
; expect the value to be materialized first, using the packed word 0x42004200
; (two half 3.0 values) replicated across four registers, and the resulting
; VGPR tuple to be passed as the last operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_mov_b32_e32 v37, v34
; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x42004200
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; WMMA f32 16x16x128 (fp8, fp8 variant) called with an all-1.0 <8 x float>
; constant operand. Both runs expect 1.0 directly as the last operand of the
; v_wmma instruction. The <8 x float> result is written with two 128-bit
; stores; the two runs differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; The <8 x float> constant operand has a single 2.0 lane (index 2) among 1.0
; lanes, so it is not a splat. Both runs expect the eight lanes to be
; materialized in registers (1.0 and 2.0 loaded once, remaining lanes copied)
; and the v_wmma instruction to take a VGPR tuple as its last operand.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <8 x float> constant operand. Unlike the all-1.0 case, both runs
; expect the literal 0x40400000 (float 3.0) to be loaded once, copied into
; every lane of an eight-register tuple, and that tuple passed as the last
; operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; WMMA f32 16x16x128 (fp8, bf8 variant) called with an all-1.0 <8 x float>
; constant operand. Both runs expect 1.0 directly as the last operand of the
; v_wmma instruction. The <8 x float> result is written with two 128-bit
; stores; the two runs differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; The <8 x float> constant operand has a single 2.0 lane (index 2) among 1.0
; lanes, so it is not a splat. Both runs expect the eight lanes to be
; materialized in registers (1.0 and 2.0 loaded once, remaining lanes copied)
; and the v_wmma instruction to take a VGPR tuple as its last operand.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <8 x float> constant operand. Unlike the all-1.0 case, both runs
; expect the literal 0x40400000 (float 3.0) to be loaded once, copied into
; every lane of an eight-register tuple, and that tuple passed as the last
; operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; WMMA f32 16x16x128 (bf8, fp8 variant) called with an all-1.0 <8 x float>
; constant operand. Both runs expect 1.0 directly as the last operand of the
; v_wmma instruction. The <8 x float> result is written with two 128-bit
; stores; the two runs differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; The <8 x float> constant operand has a single 2.0 lane (index 2) among 1.0
; lanes, so it is not a splat. Both runs expect the eight lanes to be
; materialized in registers (1.0 and 2.0 loaded once, remaining lanes copied)
; and the v_wmma instruction to take a VGPR tuple as its last operand.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <8 x float> constant operand. Unlike the all-1.0 case, both runs
; expect the literal 0x40400000 (float 3.0) to be loaded once, copied into
; every lane of an eight-register tuple, and that tuple passed as the last
; operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; WMMA f32 16x16x128 (bf8, bf8 variant) called with an all-1.0 <8 x float>
; constant operand. Both runs expect 1.0 directly as the last operand of the
; v_wmma instruction. The <8 x float> result is written with two 128-bit
; stores; the two runs differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; The <8 x float> constant operand has a single 2.0 lane (index 2) among 1.0
; lanes, so it is not a splat. Both runs expect the eight lanes to be
; materialized in registers (1.0 and 2.0 loaded once, remaining lanes copied)
; and the v_wmma instruction to take a VGPR tuple as its last operand.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <8 x float> constant operand. Unlike the all-1.0 case, both runs
; expect the literal 0x40400000 (float 3.0) to be loaded once, copied into
; every lane of an eight-register tuple, and that tuple passed as the last
; operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; WMMA f32 32x16x128 f4 with <16 x i32> %A, <8 x i32> %B and an all-1.0
; <16 x float> constant operand; this intrinsic call has no trailing i1
; arguments. Both runs expect 1.0 directly as the last operand of the v_wmma
; instruction. The <16 x float> result is written with four 128-bit stores;
; the two runs differ only in the order of those stores.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; The <16 x float> constant operand has 2.0 in lanes 2 and 10 and 1.0
; everywhere else, so it is not a splat. Both runs expect 1.0 and 2.0 to be
; loaded once each, the second 2.0 lane to be a copy of the first (v36 from v28
; in the GFX1250 run, s10 from s2 in the GISEL run), and the v_wmma instruction
; to take the resulting sixteen-register VGPR tuple as its last operand.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT: v_dual_mov_b32 v36, v28 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s2, 2.0
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s2
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; All-3.0 <16 x float> constant operand. Unlike the all-1.0 case, both runs
; expect the literal 0x40400000 (float 3.0) to be loaded once, copied into
; every lane of a sixteen-register tuple, and that tuple passed as the last
; operand of the v_wmma instruction.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled WMMA f32 32x16x128 f4 with an all-1.0 <16 x float> constant operand.
; The two scale sources are inreg i32 arguments and are expected to appear as
; s0 and s1 after the 1.0 operand. Both runs also expect the matrix_a_scale and
; matrix_b_scale modifiers set to MATRIX_SCALE_ROW1 plus matrix_a_reuse on the
; instruction.
; NOTE(review) presumably the "i32 1" arguments select ROW1 and the trailing
; "i1 true" selects matrix_a_reuse; confirm against the intrinsic definition.
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled 32x16x128 f4 WMMA where the <16 x float> constant operand is NOT a
; splat (element 1 is 2.0, every other element is 1.0) and the two i32 scale
; sources are the constants 1 and 2. Both trailing i1 operands are false.
; Both run lines expect the constant vector to be built in v[26:41] before the
; WMMA and passed as a register operand, the scale constants to appear
; directly as the operands `1, 2`, and no reuse modifier to be printed.
; The default-selector run fills the vector with v_dual_mov_b32; the
; -global-isel run builds it in s[0:15] and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; Scaled 32x16x128 f4 WMMA where the <16 x float> constant operand is an
; all-3.0 splat (0x40400000 in the expected output) and the two i32 scale
; sources are the constants 100 and 101 (0x64 and 0x65).
; Unlike the all-1.0 splat test, both run lines expect the splat to be
; materialized into v[26:41] and passed as a register operand, and expect the
; scale constants to be moved into registers first: s1/s0 via s_movk_i32 in
; the default-selector run, v42/v43 via v_mov_b32 in the -global-isel run.
; The matrix_a_reuse modifier is expected in both outputs.
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_movk_i32 s0, 0x65
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b32_e32 v42, 0x64
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: v_mov_b32_e32 v43, 0x65
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant of the scaled 32x16x128 f4 WMMA: the two scale sources are
; i64 inreg function arguments instead of i32. The <16 x float> constant
; operand is an all-1.0 splat.
; Both run lines expect the splat to be encoded as the inline constant 1.0,
; the scale sources to be read from the register pairs s[0:1] and s[2:3], and
; the matrix_a_reuse modifier to be printed. The two expected outputs differ
; only in the order of the four result stores.
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant with a non-splat <16 x float> constant operand (element 1 is
; 2.0, every other element is 1.0) and i64 scale-source constants 1 and 2.
; Both trailing i1 operands are false.
; Both run lines expect the constant vector to be built in v[26:41] before the
; WMMA and passed as a register operand, the i64 scale constants to appear
; directly as the operands `1, 2`, and no reuse modifier to be printed.
; The default-selector run fills the vector with v_dual_mov_b32; the
; -global-isel run builds it in s[0:15] and copies it over with v_mov_b64.
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 1.0
; GISEL-NEXT: s_mov_b32 s1, 2.0
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; scale16 variant where the <16 x float> constant operand is an all-3.0 splat
; (0x40400000 in the expected output) and the two i64 scale sources are the
; constants 100 and 101 (0x64 and 0x65).
; Both run lines expect the splat to be materialized into v[26:41] and passed
; as a register operand, and expect the scale constants to be moved into
; 64-bit register pairs first: s[2:3]/s[0:1] via s_mov_b64 in the
; default-selector run, v[42:43]/v[44:45] via v_mov_b64 in the -global-isel
; run. The matrix_a_reuse modifier is expected in both outputs.
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT: s_endpgm
;
; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_mov_b32 s0, 0x40400000
; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GISEL-NEXT: s_mov_b32 s14, s0
; GISEL-NEXT: s_mov_b32 s15, s0
; GISEL-NEXT: s_mov_b32 s1, s0
; GISEL-NEXT: s_mov_b32 s2, s0
; GISEL-NEXT: s_mov_b32 s3, s0
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: s_mov_b32 s5, s0
; GISEL-NEXT: s_mov_b32 s6, s0
; GISEL-NEXT: s_mov_b32 s7, s0
; GISEL-NEXT: s_mov_b32 s8, s0
; GISEL-NEXT: s_mov_b32 s9, s0
; GISEL-NEXT: s_mov_b32 s10, s0
; GISEL-NEXT: s_mov_b32 s11, s0
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s0
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GISEL-NEXT: s_clause 0x3
; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT: s_endpgm
bb:
%res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false)
store <16 x float> %res, ptr addrspace(1) %out
ret void
}
; Declarations of the llvm.amdgcn.wmma.* intrinsics, presumably one per
; intrinsic called by the tests in this file. The plain, `scale` and `scale16`
; forms of the f8f6f4 and f4 intrinsics are declared separately: the `scale`
; forms take i32 scale-source operands and the `scale16` forms take i64 ones.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)