[GISel] Explicitly disable BF16 tablegen patterns. (#124113)

We currently have an issue where bf16 patterns can be used to match fp16
types, as GISel does not know about the difference between the two. This
patch explicitly disables them to make sure that they are never used.

The opposite can also happen, where fp16 patterns are used for
operators that should be bf16. So this also changes any operations with
bf16 types to now cause a fallback to SDAG.

The pass setup for GISel has been slightly adjusted to make sure that a
verify pass does not get added between AMD-SDAG and SIFixSGPRCopiesPass,
which otherwise can cause verifier issues when falling back.
This commit is contained in:
David Green 2025-01-27 22:21:12 +00:00 committed by GitHub
parent c310b4e7bd
commit 5a81a559d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
33 changed files with 2055 additions and 4068 deletions

View File

@ -296,8 +296,21 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
MachinePreds[Edge].push_back(NewPred);
}
static bool containsBF16Type(const User &U) {
// BF16 cannot currently be represented by LLT, to avoid miscompiles we
// prevent any instructions using them. FIXME: This can be removed once LLT
// supports bfloat.
return U.getType()->getScalarType()->isBFloatTy() ||
any_of(U.operands(), [](Value *V) {
return V->getType()->getScalarType()->isBFloatTy();
});
}
bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(U))
return false;
// Get or create a virtual register for each value.
// Unless the value is a Constant => loadimm cst?
// or inline constant each time?
@ -317,6 +330,9 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(U))
return false;
Register Op0 = getOrCreateVReg(*U.getOperand(0));
Register Res = getOrCreateVReg(U);
uint32_t Flags = 0;
@ -334,6 +350,9 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
bool IRTranslator::translateCompare(const User &U,
MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(U))
return false;
auto *CI = cast<CmpInst>(&U);
Register Op0 = getOrCreateVReg(*U.getOperand(0));
Register Op1 = getOrCreateVReg(*U.getOperand(1));
@ -1553,8 +1572,7 @@ bool IRTranslator::translateBitCast(const User &U,
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
if (U.getType()->getScalarType()->isBFloatTy() ||
U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
if (containsBF16Type(U))
return false;
uint32_t Flags = 0;
@ -2647,6 +2665,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
bool IRTranslator::translateInlineAsm(const CallBase &CB,
MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(CB))
return false;
const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
@ -2736,6 +2756,9 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
}
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(U))
return false;
const CallInst &CI = cast<CallInst>(U);
auto TII = MF->getTarget().getIntrinsicInfo();
const Function *F = CI.getCalledFunction();
@ -3371,6 +3394,9 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
bool IRTranslator::translateAtomicRMW(const User &U,
MachineIRBuilder &MIRBuilder) {
if (containsBF16Type(U))
return false;
const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);

View File

@ -1017,7 +1017,7 @@ bool TargetPassConfig::addCoreISelPasses() {
if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
DebugifyIsSafe = false;
// Add instruction selector passes.
// Add instruction selector passes for global isel if enabled.
if (Selector == SelectorType::GlobalISel) {
SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true);
if (addIRTranslator())
@ -1043,15 +1043,14 @@ bool TargetPassConfig::addCoreISelPasses() {
// Pass to reset the MachineFunction if the ISel failed.
addPass(createResetMachineFunctionPass(
reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
}
// Provide a fallback path when we do not want to abort on
// not-yet-supported input.
if (!isGlobalISelAbortEnabled() && addInstSelector())
// Run the SDAG InstSelector, providing a fallback path when we do not want to
// abort on not-yet-supported input.
if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
if (addInstSelector())
return true;
} else if (addInstSelector())
return true;
// Expand pseudo-instructions emitted by ISel. Don't run the verifier before
// FinalizeISel.
addPass(&FinalizeISelID);

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; RUN: llc -mtriple=aarch64 -global-isel=1 -mattr=+fullfp16,+bf16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define float @fptrunc_f64_f32(double %a) {
; CHECK-LABEL: fptrunc_f64_f32:

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
declare hidden void @external_void_func_void() #0
@ -5594,48 +5594,14 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; CHECK-LABEL: name: test_call_external_void_func_bf16_inreg
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr16
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_bf16_inreg
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY12]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1 (%ir-block.0):
call void @external_void_func_bf16_inreg(bfloat inreg %arg)
ret void
}

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
@ -284,17 +284,15 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
@ -359,21 +357,23 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
%tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s
; Note: if you're adding tests here, also add them to
; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
@ -629,7 +629,6 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <4 x bfloat>, ptr addrspace(7) %p
@ -647,10 +646,6 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v4bf16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GISEL-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX12-GISEL
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val,
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
ret void
@ -29,7 +29,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
@ -44,7 +44,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
@ -60,7 +60,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@ -92,7 +92,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
; GFX12-GISEL-NEXT: s_endpgm
%orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 | FileCheck %s -check-prefix=GFX950-SDAG
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 | FileCheck %s -check-prefix=GFX950-GISEL
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX950-GISEL
declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
@ -20,9 +20,9 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <
;
; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen sc0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen sc0
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: flat_store_dword v[2:3], v0
@ -44,9 +44,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val,
;
; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen
; GFX950-GISEL-NEXT: s_endpgm
%orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void

View File

@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
@ -594,35 +594,35 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) {
define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) {
; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
; GISEL-GFX11: bb.1 (%ir-block.0):
; GISEL-GFX11: bb.0 (%ir-block.0):
; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
; GISEL-GFX11-NEXT: {{ $}}
; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY]]
; GISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY1]]
; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc
; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GISEL-GFX11-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; GISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GISEL-GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc
; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; GISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
; GISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]]
; GISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; GISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX11-NEXT: S_ENDPGM 0
;
; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
; GISEL-GFX10: bb.1 (%ir-block.0):
; GISEL-GFX10: bb.0 (%ir-block.0):
; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
; GISEL-GFX10-NEXT: {{ $}}
; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX10-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY]]
; GISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY1]]
; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GISEL-GFX10-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc
; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; GISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc
; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GISEL-GFX10-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]]
; GISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]]
; GISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
; GISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat

View File

@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
@ -873,32 +873,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
}
define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) {
; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
; GISEL-GFX11: bb.1 (%ir-block.0):
; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8
; GISEL-GFX11-NEXT: {{ $}}
; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GISEL-GFX11-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-NEXT: S_ENDPGM 0
;
; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
; GISEL-GFX10: bb.1 (%ir-block.0):
; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8
; GISEL-GFX10-NEXT: {{ $}}
; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GISEL-GFX10-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX10-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
@ -996,9 +970,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-NEXT: S_ENDPGM 0
;
; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@ -1020,10 +994,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF32-NEXT: {{ $}}
; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@ -1032,10 +1006,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF64-NEXT: {{ $}}
; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
@ -983,85 +983,35 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
}
define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %scale) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, v6
; GCN-NEXT: v_mov_b32_e32 v21, v5
; GCN-NEXT: v_mov_b32_e32 v20, v4
; GCN-NEXT: v_mov_b32_e32 v19, v3
; GCN-NEXT: v_mov_b32_e32 v18, v2
; GCN-NEXT: v_mov_b32_e32 v17, v1
; GCN-NEXT: v_mov_b32_e32 v16, v0
; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale)
ret <32 x bfloat> %ret
}
define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: v_mov_b32_e32 v17, s1
; GCN-NEXT: v_mov_b32_e32 v18, s2
; GCN-NEXT: v_mov_b32_e32 v19, s3
; GCN-NEXT: v_mov_b32_e32 v20, s16
; GCN-NEXT: v_mov_b32_e32 v21, s17
; GCN-NEXT: s_mov_b32 s0, 0x42c80000
; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0)
ret <32 x bfloat> %ret
}
@ -1126,85 +1076,35 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
}
define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %scale) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, v6
; GCN-NEXT: v_mov_b32_e32 v21, v5
; GCN-NEXT: v_mov_b32_e32 v20, v4
; GCN-NEXT: v_mov_b32_e32 v19, v3
; GCN-NEXT: v_mov_b32_e32 v18, v2
; GCN-NEXT: v_mov_b32_e32 v17, v1
; GCN-NEXT: v_mov_b32_e32 v16, v0
; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale)
ret <32 x bfloat> %ret
}
define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v16, s0
; GCN-NEXT: v_mov_b32_e32 v17, s1
; GCN-NEXT: v_mov_b32_e32 v18, s2
; GCN-NEXT: v_mov_b32_e32 v19, s3
; GCN-NEXT: v_mov_b32_e32 v20, s16
; GCN-NEXT: v_mov_b32_e32 v21, s17
; GCN-NEXT: s_mov_b32 s0, 0x42c80000
; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0)
ret <32 x bfloat> %ret
}

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
@ -19,44 +19,11 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -90,82 +57,26 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src,
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16
; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16
; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16
; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16
; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16
; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16
; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16
; GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16
; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16
; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16
; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16
; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16
; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16
; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16
; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16
; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16
; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16
; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16
; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16
; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16
; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16
; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16
; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16
; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16
; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16
; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s31, s15, 16
; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16
; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16
; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v24
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -252,44 +163,11 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -323,82 +201,26 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src,
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16
; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16
; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16
; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16
; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16
; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16
; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16
; GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16
; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16
; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16
; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16
; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16
; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16
; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16
; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16
; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16
; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16
; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16
; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16
; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16
; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16
; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16
; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16
; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16
; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16
; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s31, s15, 16
; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16
; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14
; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16
; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v24
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel)

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 %dst_sel)

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
@ -19,42 +19,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32
;
; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v7
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17
; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -88,82 +55,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr
;
; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_lshr_b32 s17, s0, 16
; GFX950-GISEL-NEXT: s_lshr_b32 s18, s1, 16
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s17, 16
; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s19, s2, 16
; GFX950-GISEL-NEXT: s_or_b32 s0, s17, s0
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s18, 16
; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s20, s3, 16
; GFX950-GISEL-NEXT: s_or_b32 s1, s17, s1
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s19, 16
; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s21, s4, 16
; GFX950-GISEL-NEXT: s_or_b32 s2, s17, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s20, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s22, s5, 16
; GFX950-GISEL-NEXT: s_or_b32 s3, s17, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s21, 16
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s23, s6, 16
; GFX950-GISEL-NEXT: s_or_b32 s4, s17, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s22, 16
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s24, s7, 16
; GFX950-GISEL-NEXT: s_or_b32 s5, s17, s5
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s23, 16
; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s25, s8, 16
; GFX950-GISEL-NEXT: s_or_b32 s6, s17, s6
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s24, 16
; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s26, s9, 16
; GFX950-GISEL-NEXT: s_or_b32 s7, s17, s7
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s25, 16
; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s27, s10, 16
; GFX950-GISEL-NEXT: s_or_b32 s8, s17, s8
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s26, 16
; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s28, s11, 16
; GFX950-GISEL-NEXT: s_or_b32 s9, s17, s9
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s27, 16
; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s29, s12, 16
; GFX950-GISEL-NEXT: s_or_b32 s10, s17, s10
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s28, 16
; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s30, s13, 16
; GFX950-GISEL-NEXT: s_or_b32 s11, s17, s11
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s29, 16
; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s31, s14, 16
; GFX950-GISEL-NEXT: s_or_b32 s12, s17, s12
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s30, 16
; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s33, s15, 16
; GFX950-GISEL-NEXT: s_or_b32 s13, s17, s13
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s31, 16
; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s14, s17, s14
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s33, 16
; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s15, s17, s15
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -244,42 +155,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32
;
; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v7
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14
; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX950-GISEL-NEXT: s_nop 0
; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17
; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@ -313,82 +191,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr
;
; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_lshr_b32 s17, s0, 16
; GFX950-GISEL-NEXT: s_lshr_b32 s18, s1, 16
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s17, 16
; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s19, s2, 16
; GFX950-GISEL-NEXT: s_or_b32 s0, s17, s0
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s18, 16
; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s20, s3, 16
; GFX950-GISEL-NEXT: s_or_b32 s1, s17, s1
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s19, 16
; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s21, s4, 16
; GFX950-GISEL-NEXT: s_or_b32 s2, s17, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s20, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s22, s5, 16
; GFX950-GISEL-NEXT: s_or_b32 s3, s17, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s21, 16
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s23, s6, 16
; GFX950-GISEL-NEXT: s_or_b32 s4, s17, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s22, 16
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s24, s7, 16
; GFX950-GISEL-NEXT: s_or_b32 s5, s17, s5
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s23, 16
; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s25, s8, 16
; GFX950-GISEL-NEXT: s_or_b32 s6, s17, s6
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s24, 16
; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s26, s9, 16
; GFX950-GISEL-NEXT: s_or_b32 s7, s17, s7
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s25, 16
; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s27, s10, 16
; GFX950-GISEL-NEXT: s_or_b32 s8, s17, s8
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s26, 16
; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s28, s11, 16
; GFX950-GISEL-NEXT: s_or_b32 s9, s17, s9
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s27, 16
; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s29, s12, 16
; GFX950-GISEL-NEXT: s_or_b32 s10, s17, s10
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s28, 16
; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s30, s13, 16
; GFX950-GISEL-NEXT: s_or_b32 s11, s17, s11
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s29, 16
; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s31, s14, 16
; GFX950-GISEL-NEXT: s_or_b32 s12, s17, s12
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s30, 16
; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX950-GISEL-NEXT: s_lshr_b32 s33, s15, 16
; GFX950-GISEL-NEXT: s_or_b32 s13, s17, s13
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s31, 16
; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s14, s17, s14
; GFX950-GISEL-NEXT: s_lshl_b32 s17, s33, 16
; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff
; GFX950-GISEL-NEXT: s_or_b32 s15, s17, s15
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
declare <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half>, float, i32, i1)
declare <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat>, float, i32, i1)

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
@ -146,11 +146,11 @@ define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr add
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4

View File

@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
@ -40,17 +40,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX950-ISEL: ; %bb.0: ; %entry
; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 clamp
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX950-ISEL-NEXT: s_nop 1
; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp
; GFX950-ISEL-NEXT: s_nop 2
; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX950-ISEL-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
@ -100,17 +100,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX950-ISEL: ; %bb.0: ; %entry
; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v1, s2, v0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-ISEL-NEXT: s_nop 1
; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1
; GFX950-ISEL-NEXT: s_nop 2
; GFX950-ISEL-NEXT: global_store_dword v0, v2, s[8:9]
; GFX950-ISEL-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
; RUN: llc -mtriple=amdgcn -global-isel=1 -global-isel-abort=2 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2:
@ -156,16 +156,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
@ -190,16 +180,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
@ -219,16 +199,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
@ -1856,198 +1856,92 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a2, v10
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GCN-NEXT: s_nop 6
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a2, v10
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 6
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
ret <4 x float> %result
}
; Kernel test: passes two <8 x bfloat> operands and a <4 x float> accumulator
; to llvm.amdgcn.mfma.f32.16x16x32.bf16 with all three trailing i32 immediates
; set to 0, then stores the <4 x float> result to %out. With zero immediates
; the expected v_mfma instruction carries no cbsz/abid/blgp modifiers.
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 5
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GCN-NEXT: s_nop 6
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 5
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 6
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
store <4 x float> %result, ptr addrspace(1) %out
ret void

View File

@ -1,9 +1,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; GFX10PLUS-LABEL: {{^}}dpp8_test:
; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}

View File

@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)

View File

@ -15,14 +15,12 @@
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-F32-GISEL %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2F16-GISEL %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2F16-GISEL %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; FIXME: These should fail when bfloat support is handled correctly
; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
; Make sure buffer fadd atomics with return values are not selected
; for gfx908 where they do not work.
@ -66,7 +64,7 @@ define <2 x half> @struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr
;--- raw-ret-v2bf16-error.ll
; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
; ERR-RAW-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD
; ERR-RAW-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
%ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@ -75,7 +73,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4
;--- struct-ret-v2bf16-error.ll
; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD
; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
define <2 x bfloat> @struct_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i1:

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; DPP control value 337 is valid for 64-bit DPP on gfx942

View File

@ -3,9 +3,9 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
@ -2128,10 +2128,10 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
; GFX802-GISEL-NEXT: v_readfirstlane_b32 m0, v3
; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX802-GISEL-NEXT: s_nop 1
; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)

View File

@ -2346,6 +2346,20 @@ void GlobalISelEmitter::emitRunCustomAction(raw_ostream &OS) {
<< "}\n";
}
/// Returns true if \p Node, or any node in the pattern tree rooted at it, has
/// a result type that is bf16 or a vector whose scalar type is bf16.
///
/// The function recurses through Node.children(), so a single bf16-typed
/// operand anywhere in the tree is enough to make it return true.
///
/// Declared static: it is a file-local helper, and internal linkage keeps the
/// symbol from leaking out of this translation unit.
static bool hasBFloatType(const TreePatternNode &Node) {
  // Inspect every result type of this node. Each type is iterated as a
  // sequence of pairs whose second member is the value type (presumably one
  // entry per hardware mode -- confirm against the type returned by getType).
  for (unsigned I = 0, E = Node.getNumTypes(); I != E; ++I) {
    const auto &ResultTy = Node.getType(I);
    for (const auto &Entry : ResultTy) {
      const auto &VT = Entry.second;
      const bool IsScalarBF16 = VT == MVT::bf16;
      const bool IsVectorOfBF16 =
          VT.isVector() && VT.getScalarType() == MVT::bf16;
      if (IsScalarBF16 || IsVectorOfBF16)
        return true;
    }
  }

  // No bf16 on this node itself; look through the operands.
  for (const TreePatternNode &Child : Node.children())
    if (hasBFloatType(Child))
      return true;

  return false;
}
void GlobalISelEmitter::run(raw_ostream &OS) {
if (!UseCoverageFile.empty()) {
RuleCoverage = CodeGenCoverage();
@ -2382,6 +2396,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
if (Pat.getGISelShouldIgnore())
continue; // skip without warning
// Skip any patterns containing BF16 types, as GISel cannot currently tell
// the difference between fp16 and bf16. FIXME: This can be removed once
// BF16 is supported properly.
if (hasBFloatType(Pat.getSrcPattern()))
continue;
auto MatcherOrErr = runOnPattern(Pat);
// The pattern analysis can fail, indicating an unsupported pattern.