Reland: CodeGen: Record MMOs in finalizeBundle (#166689)

(original PR: #166210)

Recording the merged memory operands (MMOs) of the bundled instructions
on the bundle itself allows more accurate alias analysis at the bundle
level. This has a number of minor effects on post-RA scheduling, all of
them in AMDGPU and most of them apparently beneficial (the Thumb2 change
is cosmetic).

The pre-existing (and unchanged) test in
CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll verifies that MIR
containing a bundle with MMOs can be parsed successfully.

v2:
- use cloneMergedMemRefs
- add another test to explicitly check the MMO bundling behavior

v3:
- use poison instead of undef to initialize the global variable in the
test

v4:
- treat bundle memory accesses as never trivially disjoint
This commit is contained in:
Nicolai Hähnle 2025-11-06 07:34:36 -08:00 committed by GitHub
parent 28c6ed5914
commit fa050eadab
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 8179 additions and 8770 deletions

View File

@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
MemOperands.push_back(MemOp);
if (Token.isNewlineOrEOF())
break;
if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
break;
if (Token.isNot(MIToken::comma))
return error("expected ',' before the next machine memory operand");
lex();

View File

@ -143,6 +143,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
SmallVector<std::pair<Register, Register>> TiedOperands;
SmallVector<MachineInstr *> MemMIs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
// Debug instructions have no effects to track.
if (MII->isDebugInstr())
@ -206,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MIB.setMIFlag(MachineInstr::FrameSetup);
if (MII->getFlag(MachineInstr::FrameDestroy))
MIB.setMIFlag(MachineInstr::FrameDestroy);
if (MII->mayLoadOrStore())
MemMIs.push_back(&*MII);
}
for (Register Reg : LocalDefs) {
@ -231,6 +235,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
assert(UseIdx < ExternUses.size());
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
}
MIB->cloneMergedMemRefs(MF, MemMIs);
}
/// finalizeBundle - Same functionality as the previous finalizeBundle except

View File

@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isLDSDMA(MIa) || isLDSDMA(MIb))
return false;
if (MIa.isBundle() || MIb.isBundle())
return false;
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,

View File

@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1

View File

@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1

File diff suppressed because it is too large Load Diff

View File

@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v4
; SI-NEXT: v_mov_b32_e32 v31, v2
; SI-NEXT: v_mov_b32_e32 v35, v0
@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5
@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v6, 0xff, v30
; SI-NEXT: v_or_b32_e32 v6, v6, v47
; SI-NEXT: v_cvt_f32_f16_e32 v15, v6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v50
; SI-NEXT: v_or_b32_e32 v6, v6, v56
; SI-NEXT: v_cvt_f32_f16_e32 v32, v6
@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v59, v0
; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: s_movk_i32 s6, 0x300
; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v57, v0
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v56, v0
@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0

File diff suppressed because it is too large Load Diff

View File

@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v36i16_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr63
@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v47, v9
; SI-NEXT: v_cvt_f32_f16_e32 v60, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
; SI-NEXT: v_cvt_f32_f16_e32 v45, v11
; SI-NEXT: v_cvt_f32_f16_e32 v58, v12
@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill

View File

@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@ -4914,7 +4911,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -4947,7 +4944,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -4980,7 +4977,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -5073,7 +5070,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -5106,7 +5103,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -5139,7 +5136,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -8520,7 +8517,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -8553,7 +8550,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -8586,7 +8583,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -8679,7 +8676,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -8712,7 +8709,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -8745,7 +8742,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@ -13113,7 +13107,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -13146,7 +13140,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -13179,7 +13173,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -13272,7 +13266,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -13305,7 +13299,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -13338,7 +13332,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -16833,7 +16827,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -16866,7 +16860,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -16899,7 +16893,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -16992,7 +16986,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -17025,7 +17019,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -17058,7 +17052,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -19249,6 +19243,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@ -19270,17 +19275,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@ -19302,13 +19296,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@ -20622,7 +20613,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -20655,7 +20646,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -20688,7 +20679,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -20781,7 +20772,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -20814,7 +20805,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -20847,7 +20838,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -24238,7 +24229,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -24271,7 +24262,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -24304,7 +24295,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -24397,7 +24388,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -24430,7 +24421,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -24463,7 +24454,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -25988,6 +25979,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@ -26009,17 +26011,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@ -26041,13 +26032,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@ -27361,7 +27349,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -27394,7 +27382,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -27427,7 +27415,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -27520,7 +27508,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -27553,7 +27541,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -27586,7 +27574,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -31014,7 +31002,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@ -31047,7 +31035,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@ -31080,7 +31068,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@ -31173,7 +31161,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@ -31206,7 +31194,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@ -31239,7 +31227,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xa
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@ -31389,6 +31377,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
@ -31405,17 +31404,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
@ -31472,7 +31460,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@ -31523,7 +31511,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f32_f16_e32 v40, v48
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@ -31623,7 +31610,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@ -31643,7 +31629,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill

View File

@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@ -5329,7 +5328,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -5362,7 +5361,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -5395,7 +5394,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -5496,7 +5495,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -5529,7 +5528,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -5562,7 +5561,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -9311,7 +9310,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -9344,7 +9343,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -9377,7 +9376,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -9478,7 +9477,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -9511,7 +9510,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -9544,7 +9543,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@ -14292,7 +14290,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -14325,7 +14323,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -14358,7 +14356,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -14459,7 +14457,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -14492,7 +14490,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -14525,7 +14523,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -18407,7 +18405,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -18440,7 +18438,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -18473,7 +18471,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -18574,7 +18572,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -18607,7 +18605,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -18640,7 +18638,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -21004,6 +21002,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@ -21026,17 +21035,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@ -21054,9 +21052,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@ -22541,7 +22538,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -22574,7 +22571,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -22607,7 +22604,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -22708,7 +22705,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -22741,7 +22738,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -22774,7 +22771,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -26535,7 +26532,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -26568,7 +26565,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -26601,7 +26598,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -26702,7 +26699,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -26735,7 +26732,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -26768,7 +26765,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -28420,6 +28417,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@ -28442,17 +28450,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v39, v16
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@ -28470,9 +28467,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@ -29957,7 +29953,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -29990,7 +29986,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -30023,7 +30019,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -30124,7 +30120,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -30157,7 +30153,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -30190,7 +30186,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@ -33996,7 +33992,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@ -34029,7 +34025,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@ -34062,7 +34058,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@ -34163,7 +34159,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@ -34196,7 +34192,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@ -34229,7 +34225,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xc
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264

View File

@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@ -5806,7 +5799,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -5839,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -5872,7 +5865,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -5979,7 +5972,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -6012,7 +6005,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -6045,7 +6038,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@ -10214,7 +10207,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -10247,7 +10240,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -10280,7 +10273,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -10387,7 +10380,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -10420,7 +10413,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -10453,7 +10446,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@ -15643,7 +15629,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -15676,7 +15662,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -15709,7 +15695,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -15816,7 +15802,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -15849,7 +15835,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -15882,7 +15868,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -18157,6 +18143,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -18173,8 +18161,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@ -18201,34 +18187,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@ -20192,7 +20178,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -20225,7 +20211,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -20258,7 +20244,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -20365,7 +20351,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -20398,7 +20384,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -20431,7 +20417,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -22982,6 +22968,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -23006,22 +23008,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@ -23037,21 +23023,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@ -24743,7 +24722,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -24776,7 +24755,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -24809,7 +24788,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -24916,7 +24895,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -24949,7 +24928,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -24982,7 +24961,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -27128,6 +27107,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -27144,8 +27125,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@ -27172,34 +27151,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@ -29163,7 +29142,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -29196,7 +29175,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -29229,7 +29208,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -29336,7 +29315,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -29369,7 +29348,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -29402,7 +29381,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -31199,6 +31178,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -31223,22 +31218,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v48, v14
; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v50, v10
; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v52, v6
; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_mov_b32_e32 v54, v2
; SI-NEXT: v_mov_b32_e32 v55, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@ -31254,21 +31233,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@ -32960,7 +32932,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -32993,7 +32965,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -33026,7 +32998,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -33133,7 +33105,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -33166,7 +33138,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -33199,7 +33171,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -35392,6 +35364,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@ -35408,8 +35382,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@ -35436,34 +35408,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@ -37427,7 +37399,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@ -37460,7 +37432,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@ -37493,7 +37465,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@ -37600,7 +37572,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@ -37633,7 +37605,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@ -37666,7 +37638,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
; GFX11-TRUE16-NEXT: s_clause 0xe
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@ -41255,6 +41227,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@ -41271,11 +41248,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v61, v2
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
@ -41320,16 +41292,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v50, s25
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
; SI-NEXT: v_cvt_f16_f32_e32 v29, s29
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v32
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v43, v33
; SI-NEXT: v_cvt_f16_f32_e32 v32, v20
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v25, v35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
; SI-NEXT: s_and_b64 s[4:5], vcc, exec

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x70
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1

View File

@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1
; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX1100-NEXT: s_clause 0x1
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3

View File

@ -4253,6 +4253,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
; VI-NEXT: s_add_u32 s36, s36, s3
@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; VI-NEXT: s_endpgm
@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
; CI-NEXT: s_add_u32 s36, s36, s3
@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CI-NEXT: s_endpgm
@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s3
@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GFX9-NEXT: s_endpgm

View File

@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b8 v0, v1 offset:9
; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:9
; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2

View File

@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
; CI-NEXT: v_or_b32_e32 v10, v14, v10
; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; CI-NEXT: v_cvt_f16_f32_e32 v22, v27
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; CI-NEXT: v_or_b32_e32 v13, v16, v13
; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_or_b32_e32 v19, v20, v19
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; CI-NEXT: v_cvt_f16_f32_e32 v21, v30
; CI-NEXT: v_or_b32_e32 v20, v22, v20
; CI-NEXT: v_cvt_f16_f32_e32 v22, v29
; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_or_b32_e32 v21, v22, v21
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: v_or_b32_e32 v14, v15, v14
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; CI-NEXT: v_or_b32_e32 v12, v12, v15
; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
; CI-NEXT: v_or_b32_e32 v11, v16, v11
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_or_b32_e32 v31, v32, v31
; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: v_or_b32_e32 v14, v15, v14
; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; CI-NEXT: v_or_b32_e32 v12, v12, v15
; CI-NEXT: v_or_b32_e32 v11, v16, v11
; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen

View File

@ -1,6 +1,19 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
--- |
@foo = addrspace(3) global i32 poison
define void @test_overlap() { unreachable }
define void @test_dead_redef() { unreachable }
define void @test_tied() { unreachable }
define void @test_mmo_merge1() { unreachable }
define void @test_mmo_merge2() { unreachable }
define void @test_mmo_drop() { unreachable }
...
---
name: test_overlap
body: |
@ -47,3 +60,42 @@ body: |
%1:vgpr_32 = COPY %0:vgpr_32
%2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec
...
---
name: test_mmo_merge1
body: |
bb.0:
; CHECK-LABEL: name: test_mmo_merge1
; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
; CHECK-NEXT: }
%1:vgpr_32 = COPY %0:vgpr_32
DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
...
---
name: test_mmo_merge2
body: |
bb.0:
; CHECK-LABEL: name: test_mmo_merge2
; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) {
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
; CHECK-NEXT: }
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
...
---
name: test_mmo_drop
body: |
bb.0:
; CHECK-LABEL: name: test_mmo_drop
; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec {
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
; CHECK-NEXT: }
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
...

View File

@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v100, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_clause 0x1f
; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8
@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo
; GFX11-NEXT: s_addk_i32 s32, 0x90
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116
@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v95, s33
; GFX11-NEXT: v_writelane_b32 v100, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v95, off, s33
; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8
@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204
@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0x10
; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0xd
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172
@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_add_i32 s32, s32, 0x28000
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0
; GFX9-NEXT: v_mov_b32_e32 v30, 0
; GFX9-NEXT: v_mov_b32_e32 v31, 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636
@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: s_mov_b32 s38, s34
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_add_i32 s32, s32, 0x14000
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 0
@ -3006,6 +2991,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v31, 0
; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi
; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v63, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: s_clause 0x28
@ -3138,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152
; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544
@ -3151,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: s_clause 0xe
; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8
@ -3199,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_mov_b32 s36, s34
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_addk_i32 s32, 0xa00
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36
@ -3341,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: v_mov_b32_e32 v30, v46
; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8

View File

@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64

View File

@ -11,7 +11,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@ -29,7 +29,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)

View File

@ -10,7 +10,7 @@ body: |
; CHECK-LABEL: name: mimg
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@ -28,7 +28,7 @@ body: |
; CHECK-LABEL: name: mimg_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)

View File

@ -1,13 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
; MIR-LABEL: name: gws_barrier_offset0{{$}}
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
; MIR-LABEL: name: gws_barrier_offset0
; MIR: bb.0 (%ir-block.0):
; MIR-NEXT: liveins: $sgpr8_sgpr9
; MIR-NEXT: {{ $}}
; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
; MIR-NEXT: $m0 = S_MOV_B32 0
; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
; MIR-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; MIR: {{.*}}

View File

@ -35,7 +35,7 @@
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
; MIR-LABEL: name: gws_barrier_offset0{{$}}
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }

View File

@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]

View File

@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]

View File

@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v5, s16
; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v5, s16
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]

View File

@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v17, s3
; GFX8-NEXT: v_mov_b32_e32 v16, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;

View File

@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9

View File

@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176

View File

@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm

View File

@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: s_nop 0
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: s_nop 0
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: s_nop 0
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: s_nop 0
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16

View File

@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7)
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0

View File

@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3

View File

@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0

File diff suppressed because it is too large Load Diff

View File

@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0

View File

@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]

View File

@ -56,11 +56,11 @@ body: |
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: }
; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) {
; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: }
; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) {
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: }
@ -359,6 +359,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABLE: name: no_sched_barrier_within_bundle
; GCN-LABEL: name: no_sched_barrier_within_bundle
; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {

View File

@ -9,7 +9,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vimage
; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) {
; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: }
@ -25,7 +25,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vsample
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) {
; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: }

View File

@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
; GFX900-NEXT: s_addk_i32 s5, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT: s_addk_i32 s3, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
; GFX90A-NEXT: s_waitcnt vmcnt(8)
; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: s_waitcnt vmcnt(8)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2

File diff suppressed because it is too large Load Diff

View File

@ -3,9 +3,6 @@
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
; CHECK-LABEL: excess_soft_clause_reg_pressure:
; CHECK: BB0_1: ; %for.cond28.preheader
; CHECK: s_load_dwordx16
; CHECK-NEXT: s_load_dwordx16
; CHECK: global_load_dword
; CHECK-NEXT: global_load_dword
; CHECK-NEXT: global_load_dword
@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
; CHECK: s_load_dwordx16
; CHECK: s_load_dwordx16
; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
entry:
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%i2 = load i64, ptr addrspace(4) %i, align 8

View File

@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART

View File

@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010

View File

@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_writelane_b32 v40, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34

View File

@ -98,28 +98,29 @@ body: |
; CHECK-LABEL: name: foo
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr {
; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
; CHECK: }
; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 {
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
; CHECK: }
; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
; CHECK-NEXT: MVE_VPST 4, implicit $vpr
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
; CHECK-NEXT: }
; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
$sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4