Reland: CodeGen: Record MMOs in finalizeBundle (#166689)
(original PR: #166210)

This allows more accurate alias analysis to apply at the bundle level. This has a bunch of minor effects in post-RA scheduling that look mostly beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic).

The pre-existing (and unchanged) test in CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll tests that MIR with a bundle with MMOs can be parsed successfully.

v2:
- use cloneMergedMemRefs
- add another test to explicitly check the MMO bundling behavior

v3:
- use poison instead of undef to initialize the global variable in the test

v4:
- treat bundle memory accesses as never trivially disjoint
This commit is contained in:
parent
28c6ed5914
commit
fa050eadab
@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
|
||||
MemOperands.push_back(MemOp);
|
||||
if (Token.isNewlineOrEOF())
|
||||
break;
|
||||
if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
|
||||
break;
|
||||
if (Token.isNot(MIToken::comma))
|
||||
return error("expected ',' before the next machine memory operand");
|
||||
lex();
|
||||
|
||||
@ -143,6 +143,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
|
||||
SmallSet<Register, 8> KilledUseSet;
|
||||
SmallSet<Register, 8> UndefUseSet;
|
||||
SmallVector<std::pair<Register, Register>> TiedOperands;
|
||||
SmallVector<MachineInstr *> MemMIs;
|
||||
for (auto MII = FirstMI; MII != LastMI; ++MII) {
|
||||
// Debug instructions have no effects to track.
|
||||
if (MII->isDebugInstr())
|
||||
@ -206,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
|
||||
MIB.setMIFlag(MachineInstr::FrameSetup);
|
||||
if (MII->getFlag(MachineInstr::FrameDestroy))
|
||||
MIB.setMIFlag(MachineInstr::FrameDestroy);
|
||||
|
||||
if (MII->mayLoadOrStore())
|
||||
MemMIs.push_back(&*MII);
|
||||
}
|
||||
|
||||
for (Register Reg : LocalDefs) {
|
||||
@ -231,6 +235,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
|
||||
assert(UseIdx < ExternUses.size());
|
||||
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
|
||||
}
|
||||
|
||||
MIB->cloneMergedMemRefs(MF, MemMIs);
|
||||
}
|
||||
|
||||
/// finalizeBundle - Same functionality as the previous finalizeBundle except
|
||||
|
||||
@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
|
||||
if (isLDSDMA(MIa) || isLDSDMA(MIb))
|
||||
return false;
|
||||
|
||||
if (MIa.isBundle() || MIb.isBundle())
|
||||
return false;
|
||||
|
||||
// TODO: Should we check the address space from the MachineMemOperand? That
|
||||
// would allow us to distinguish objects we know don't alias based on the
|
||||
// underlying address space, even if it was lowered to a different one,
|
||||
|
||||
@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
|
||||
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, s0
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0
|
||||
@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
|
||||
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
|
||||
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
|
||||
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
|
||||
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
|
||||
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
|
||||
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
|
||||
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s1
|
||||
|
||||
@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
|
||||
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
||||
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
||||
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
|
||||
; GFX906-NEXT: s_nop 0
|
||||
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
|
||||
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
|
||||
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
|
||||
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
|
||||
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
|
||||
@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
|
||||
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
|
||||
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
|
||||
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
|
||||
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
|
||||
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
|
||||
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
|
||||
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
||||
; GFX906-NEXT: s_cbranch_execz .LBB6_2
|
||||
; GFX906-NEXT: ; %bb.1: ; %bb.1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; VI-LABEL: bitcast_v40i8_to_v20i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v34, v10
|
||||
; VI-NEXT: v_mov_b32_e32 v33, v8
|
||||
; VI-NEXT: v_mov_b32_e32 v35, v6
|
||||
@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
|
||||
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
|
||||
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v31, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
|
||||
@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: s_waitcnt vmcnt(14)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(5)
|
||||
; VI-NEXT: s_waitcnt vmcnt(13)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
|
||||
; VI-NEXT: s_waitcnt vmcnt(4)
|
||||
; VI-NEXT: s_waitcnt vmcnt(12)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: s_waitcnt vmcnt(11)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
|
||||
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x300
|
||||
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: s_waitcnt vmcnt(10)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v54
|
||||
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v53
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v51
|
||||
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v30
|
||||
@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i8_to_v20f16:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v36, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v31, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v35, v0
|
||||
@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5
|
||||
@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: ; implicit-def: $vgpr15
|
||||
; SI-NEXT: ; implicit-def: $vgpr17
|
||||
; SI-NEXT: ; implicit-def: $vgpr19
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(8)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34
|
||||
; SI-NEXT: ; implicit-def: $vgpr33
|
||||
; SI-NEXT: ; implicit-def: $vgpr32
|
||||
@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v6, 0xff, v30
|
||||
; SI-NEXT: v_or_b32_e32 v6, v6, v47
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v15, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(12)
|
||||
; SI-NEXT: v_and_b32_e32 v6, 0xff, v50
|
||||
; SI-NEXT: v_or_b32_e32 v6, v6, v56
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v32, v6
|
||||
@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; SI-NEXT: v_or_b32_e32 v0, v59, v0
|
||||
; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; SI-NEXT: s_movk_i32 s6, 0x300
|
||||
; SI-NEXT: v_or_b32_e32 v0, v58, v0
|
||||
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(13)
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; SI-NEXT: v_or_b32_e32 v0, v57, v0
|
||||
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(12)
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; SI-NEXT: v_or_b32_e32 v0, v56, v0
|
||||
@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; VI-LABEL: bitcast_v40i8_to_v20f16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v34, v10
|
||||
; VI-NEXT: v_mov_b32_e32 v33, v8
|
||||
; VI-NEXT: v_mov_b32_e32 v35, v6
|
||||
@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
|
||||
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
|
||||
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v31, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
|
||||
@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: s_waitcnt vmcnt(14)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(5)
|
||||
; VI-NEXT: s_waitcnt vmcnt(13)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
|
||||
; VI-NEXT: s_waitcnt vmcnt(4)
|
||||
; VI-NEXT: s_waitcnt vmcnt(12)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: s_waitcnt vmcnt(11)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
|
||||
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x300
|
||||
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: s_waitcnt vmcnt(10)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v54
|
||||
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v53
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v51
|
||||
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_add_u16_e32 v0, 3, v30
|
||||
@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i8_to_v5f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v36, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v35, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(8)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(13)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: s_waitcnt vmcnt(12)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
|
||||
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; SI-NEXT: v_or_b32_e32 v8, v25, v8
|
||||
; SI-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
|
||||
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
|
||||
; SI-NEXT: v_or_b32_e32 v8, v8, v23
|
||||
@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; SI-NEXT: v_or_b32_e32 v8, v25, v8
|
||||
; SI-NEXT: v_or_b32_e32 v7, v8, v7
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
|
||||
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
|
||||
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
|
||||
@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; VI-LABEL: bitcast_v40i8_to_v5f64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v36, v10
|
||||
; VI-NEXT: v_mov_b32_e32 v35, v8
|
||||
; VI-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
|
||||
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
|
||||
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v38, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
|
||||
@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: s_waitcnt vmcnt(14)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(5)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
|
||||
; VI-NEXT: s_waitcnt vmcnt(4)
|
||||
; VI-NEXT: s_waitcnt vmcnt(13)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: s_waitcnt vmcnt(12)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
|
||||
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
|
||||
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_add_u16_e32 v8, 3, v50
|
||||
; VI-NEXT: v_add_u16_e32 v10, 3, v49
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-LABEL: bitcast_v40i8_to_v5f64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v36, v10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v35, v8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
|
||||
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
|
||||
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v38, v14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v37, v12
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
|
||||
@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
|
||||
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
|
||||
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i8_to_v5i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v36, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v35, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(8)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(13)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: s_waitcnt vmcnt(12)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
|
||||
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; SI-NEXT: v_or_b32_e32 v8, v25, v8
|
||||
; SI-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
|
||||
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
|
||||
; SI-NEXT: v_or_b32_e32 v8, v8, v23
|
||||
@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; SI-NEXT: v_or_b32_e32 v8, v25, v8
|
||||
; SI-NEXT: v_or_b32_e32 v7, v8, v7
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(9)
|
||||
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
|
||||
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
|
||||
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
|
||||
@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; VI-LABEL: bitcast_v40i8_to_v5i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v36, v10
|
||||
; VI-NEXT: v_mov_b32_e32 v35, v8
|
||||
; VI-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
|
||||
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
|
||||
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
|
||||
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; VI-NEXT: v_mov_b32_e32 v38, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v37, v12
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
|
||||
@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: s_waitcnt vmcnt(14)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(8)
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
|
||||
; VI-NEXT: s_waitcnt vmcnt(5)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
|
||||
; VI-NEXT: s_waitcnt vmcnt(4)
|
||||
; VI-NEXT: s_waitcnt vmcnt(13)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(3)
|
||||
; VI-NEXT: s_waitcnt vmcnt(12)
|
||||
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
|
||||
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
|
||||
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_waitcnt vmcnt(9)
|
||||
; VI-NEXT: v_add_u16_e32 v8, 3, v50
|
||||
; VI-NEXT: v_add_u16_e32 v10, 3, v49
|
||||
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-LABEL: bitcast_v40i8_to_v5i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v36, v10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v35, v8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v34, v6
|
||||
@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
|
||||
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
|
||||
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v38, v14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v37, v12
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
|
||||
@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
|
||||
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
|
||||
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
|
||||
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v36f16_to_v18i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
|
||||
@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v36f16_to_v18f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
|
||||
@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v36f16_to_v9i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
|
||||
@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v36f16_to_v9f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
|
||||
@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v36i16_to_v36f16:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
|
||||
@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: ; implicit-def: $vgpr48
|
||||
; SI-NEXT: ; kill: killed $vgpr48
|
||||
; SI-NEXT: ; implicit-def: $vgpr48
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: ; implicit-def: $vgpr62
|
||||
; SI-NEXT: ; implicit-def: $vgpr32
|
||||
; SI-NEXT: ; implicit-def: $vgpr63
|
||||
@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: ; implicit-def: $vgpr50
|
||||
; SI-NEXT: ; kill: killed $vgpr48
|
||||
; SI-NEXT: ; implicit-def: $vgpr48
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
|
||||
; SI-NEXT: ; implicit-def: $vgpr31
|
||||
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v47, v9
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v60, v10
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v45, v11
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v58, v12
|
||||
@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
|
||||
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
|
||||
@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i16_to_v20i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v20
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
|
||||
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
|
||||
@ -4914,7 +4911,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -4947,7 +4944,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -4980,7 +4977,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -5073,7 +5070,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -5106,7 +5103,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -5139,7 +5136,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -8520,7 +8517,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -8553,7 +8550,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -8586,7 +8583,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -8679,7 +8676,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -8712,7 +8709,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -8745,7 +8742,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i16_to_v20f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v20
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
|
||||
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
|
||||
@ -13113,7 +13107,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -13146,7 +13140,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -13179,7 +13173,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -13272,7 +13266,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -13305,7 +13299,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -13338,7 +13332,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -16833,7 +16827,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -16866,7 +16860,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -16899,7 +16893,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -16992,7 +16986,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -17025,7 +17019,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -17058,7 +17052,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -19249,6 +19243,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i16_to_v10i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
@ -19270,17 +19275,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v20
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
@ -19302,13 +19296,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
|
||||
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
|
||||
@ -20622,7 +20613,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -20655,7 +20646,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -20688,7 +20679,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -20781,7 +20772,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -20814,7 +20805,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -20847,7 +20838,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -24238,7 +24229,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -24271,7 +24262,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -24304,7 +24295,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -24397,7 +24388,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -24430,7 +24421,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -24463,7 +24454,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -25988,6 +25979,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i16_to_v10f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
|
||||
@ -26009,17 +26011,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v37, v20
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
@ -26041,13 +26032,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
|
||||
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
|
||||
@ -27361,7 +27349,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -27394,7 +27382,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -27427,7 +27415,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -27520,7 +27508,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -27553,7 +27541,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -27586,7 +27574,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -31014,7 +31002,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
|
||||
@ -31047,7 +31035,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
|
||||
@ -31080,7 +31068,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
|
||||
@ -31173,7 +31161,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
|
||||
@ -31206,7 +31194,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
|
||||
@ -31239,7 +31227,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
|
||||
@ -31389,6 +31377,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v40i16_to_v40f16:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
|
||||
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
|
||||
@ -31405,17 +31404,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
|
||||
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
|
||||
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
|
||||
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
|
||||
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: ; implicit-def: $vgpr40
|
||||
; SI-NEXT: ; kill: killed $vgpr40
|
||||
; SI-NEXT: ; implicit-def: $vgpr40
|
||||
@ -31472,7 +31460,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: ; implicit-def: $vgpr42
|
||||
; SI-NEXT: ; kill: killed $vgpr40
|
||||
; SI-NEXT: ; implicit-def: $vgpr40
|
||||
; SI-NEXT: s_waitcnt vmcnt(8)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
|
||||
; SI-NEXT: ; implicit-def: $vgpr31
|
||||
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
@ -31523,7 +31511,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v40, v48
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
@ -31623,7 +31610,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
|
||||
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
@ -31643,7 +31629,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
|
||||
; SI-NEXT: s_waitcnt vmcnt(8)
|
||||
; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
|
||||
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
|
||||
@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v44i16_to_v22i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
|
||||
@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
|
||||
@ -5329,7 +5328,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -5362,7 +5361,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -5395,7 +5394,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -5496,7 +5495,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -5529,7 +5528,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -5562,7 +5561,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -9311,7 +9310,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -9344,7 +9343,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -9377,7 +9376,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -9478,7 +9477,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -9511,7 +9510,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -9544,7 +9543,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v44i16_to_v22f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
|
||||
@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
|
||||
@ -14292,7 +14290,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -14325,7 +14323,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -14358,7 +14356,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -14459,7 +14457,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -14492,7 +14490,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -14525,7 +14523,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -18407,7 +18405,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -18440,7 +18438,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -18473,7 +18471,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -18574,7 +18572,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -18607,7 +18605,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -18640,7 +18638,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -21004,6 +21002,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v44i16_to_v11i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
@ -21026,17 +21035,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
|
||||
@ -21054,9 +21052,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
|
||||
@ -22541,7 +22538,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -22574,7 +22571,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -22607,7 +22604,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -22708,7 +22705,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -22741,7 +22738,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -22774,7 +22771,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -26535,7 +26532,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -26568,7 +26565,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -26601,7 +26598,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -26702,7 +26699,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -26735,7 +26732,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -26768,7 +26765,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -28420,6 +28417,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v44i16_to_v11f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
|
||||
@ -28442,17 +28450,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v39, v16
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v38, v18
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
|
||||
@ -28470,9 +28467,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
|
||||
@ -29957,7 +29953,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -29990,7 +29986,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -30023,7 +30019,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -30124,7 +30120,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -30157,7 +30153,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -30190,7 +30186,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
@ -33996,7 +33992,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
|
||||
@ -34029,7 +34025,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
|
||||
@ -34062,7 +34058,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
|
||||
@ -34163,7 +34159,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
|
||||
@ -34196,7 +34192,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
|
||||
@ -34229,7 +34225,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
|
||||
|
||||
@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48i16_to_v24i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
|
||||
@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
|
||||
@ -5806,7 +5799,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -5839,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -5872,7 +5865,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -5979,7 +5972,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -6012,7 +6005,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -6045,7 +6038,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48f16_to_v24i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
|
||||
@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
|
||||
@ -10214,7 +10207,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -10247,7 +10240,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -10280,7 +10273,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -10387,7 +10380,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -10420,7 +10413,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -10453,7 +10446,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48i16_to_v24f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
|
||||
@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
|
||||
@ -15643,7 +15629,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -15676,7 +15662,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -15709,7 +15695,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -15816,7 +15802,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -15849,7 +15835,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -15882,7 +15868,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -18157,6 +18143,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48f16_to_v24f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -18173,8 +18161,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
|
||||
@ -18201,34 +18187,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
|
||||
@ -20192,7 +20178,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -20225,7 +20211,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -20258,7 +20244,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -20365,7 +20351,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -20398,7 +20384,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -20431,7 +20417,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -22982,6 +22968,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48i16_to_v12i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -23006,22 +23008,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
|
||||
@ -23037,21 +23023,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
|
||||
@ -24743,7 +24722,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -24776,7 +24755,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -24809,7 +24788,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -24916,7 +24895,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -24949,7 +24928,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -24982,7 +24961,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -27128,6 +27107,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48f16_to_v12i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -27144,8 +27125,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
|
||||
@ -27172,34 +27151,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
|
||||
@ -29163,7 +29142,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -29196,7 +29175,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -29229,7 +29208,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -29336,7 +29315,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -29369,7 +29348,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -29402,7 +29381,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -31199,6 +31178,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48i16_to_v12f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -31223,22 +31218,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_mov_b32_e32 v48, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v49, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v50, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v51, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v52, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v53, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v54, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v55, v0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
|
||||
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
|
||||
@ -31254,21 +31233,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
|
||||
; SI-NEXT: s_waitcnt vmcnt(7)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(6)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(5)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
|
||||
@ -32960,7 +32932,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -32993,7 +32965,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -33026,7 +32998,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -33133,7 +33105,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -33166,7 +33138,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -33199,7 +33171,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -35392,6 +35364,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
|
||||
; SI-LABEL: bitcast_v48f16_to_v12f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
|
||||
@ -35408,8 +35382,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
|
||||
@ -35436,34 +35408,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
|
||||
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
|
||||
@ -37427,7 +37399,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
|
||||
@ -37460,7 +37432,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
|
||||
@ -37493,7 +37465,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
|
||||
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
|
||||
@ -37600,7 +37572,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
|
||||
@ -37633,7 +37605,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
|
||||
@ -37666,7 +37638,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe
|
||||
; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
|
||||
@ -41255,6 +41227,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
|
||||
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
||||
@ -41271,11 +41248,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
|
||||
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
||||
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
|
||||
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
|
||||
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
|
||||
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
|
||||
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
|
||||
; SI-NEXT: s_waitcnt expcnt(2)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v61, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
|
||||
@ -41320,16 +41292,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v50, s25
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v29, s29
|
||||
; SI-NEXT: s_waitcnt vmcnt(4)
|
||||
; SI-NEXT: s_waitcnt vmcnt(14)
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v31, v32
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v43, v33
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v32, v20
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v25, v35
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
|
||||
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
|
||||
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
|
||||
; GFX8-NEXT: s_movk_i32 s4, 0x70
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
|
||||
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
|
||||
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
|
||||
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
|
||||
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
|
||||
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
|
||||
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
||||
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
|
||||
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
|
||||
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
|
||||
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
|
||||
|
||||
@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
|
||||
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
|
||||
@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
|
||||
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
|
||||
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
|
||||
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
|
||||
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
|
||||
@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
|
||||
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
|
||||
; SDAG-GFX1100-NEXT: s_clause 0x1
|
||||
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54
|
||||
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
|
||||
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0
|
||||
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1
|
||||
; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
|
||||
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
|
||||
; GISEL-GFX1100-NEXT: s_clause 0x1
|
||||
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
|
||||
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
|
||||
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
|
||||
; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
|
||||
|
||||
@ -4253,6 +4253,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
|
||||
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
|
||||
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
|
||||
; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
|
||||
; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
|
||||
; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; VI-NEXT: s_mov_b32 s38, -1
|
||||
; VI-NEXT: s_mov_b32 s39, 0xe80000
|
||||
; VI-NEXT: s_add_u32 s36, s36, s3
|
||||
@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
|
||||
; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
|
||||
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; VI-NEXT: s_waitcnt vmcnt(7)
|
||||
; VI-NEXT: s_waitcnt vmcnt(6)
|
||||
; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32
|
||||
; VI-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
|
||||
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
|
||||
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
|
||||
; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
|
||||
; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
|
||||
; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; CI-NEXT: s_mov_b32 s38, -1
|
||||
; CI-NEXT: s_mov_b32 s39, 0xe8f000
|
||||
; CI-NEXT: s_add_u32 s36, s36, s3
|
||||
@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
|
||||
; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
|
||||
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; CI-NEXT: s_waitcnt vmcnt(7)
|
||||
; CI-NEXT: s_waitcnt vmcnt(6)
|
||||
; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32
|
||||
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; CI-NEXT: s_endpgm
|
||||
@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
|
||||
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
|
||||
; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
|
||||
; GFX9-NEXT: s_mov_b32 s38, -1
|
||||
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
|
||||
; GFX9-NEXT: s_add_u32 s36, s36, s3
|
||||
@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
|
||||
@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp
|
||||
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; CI-NEXT: ds_write_b8 v0, v2 offset:13
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
|
||||
; CI-NEXT: ds_write_b8 v0, v1 offset:5
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; CI-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; CI-NEXT: ds_write_b8 v0, v2 offset:13
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
|
||||
|
||||
@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
|
||||
; CI-NEXT: v_or_b32_e32 v10, v14, v10
|
||||
; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
|
||||
; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
|
||||
; CI-NEXT: v_or_b32_e32 v17, v18, v17
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
||||
; CI-NEXT: v_or_b32_e32 v17, v18, v17
|
||||
; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v22, v27
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
|
||||
; CI-NEXT: v_or_b32_e32 v13, v16, v13
|
||||
; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
|
||||
; CI-NEXT: v_or_b32_e32 v19, v20, v19
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v21, v30
|
||||
; CI-NEXT: v_or_b32_e32 v20, v22, v20
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v22, v29
|
||||
; CI-NEXT: s_waitcnt vmcnt(6)
|
||||
; CI-NEXT: s_waitcnt vmcnt(8)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
||||
; CI-NEXT: s_waitcnt vmcnt(5)
|
||||
; CI-NEXT: s_waitcnt vmcnt(7)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
|
||||
; CI-NEXT: v_or_b32_e32 v21, v22, v21
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
||||
; CI-NEXT: s_waitcnt vmcnt(3)
|
||||
; CI-NEXT: s_waitcnt vmcnt(5)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
||||
; CI-NEXT: s_waitcnt vmcnt(2)
|
||||
; CI-NEXT: s_waitcnt vmcnt(4)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
|
||||
@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
|
||||
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
|
||||
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
|
||||
; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
|
||||
; CI-NEXT: s_waitcnt vmcnt(6)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
||||
; CI-NEXT: s_waitcnt vmcnt(5)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
|
||||
; CI-NEXT: v_or_b32_e32 v14, v15, v14
|
||||
; CI-NEXT: s_waitcnt vmcnt(3)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
|
||||
; CI-NEXT: v_or_b32_e32 v12, v12, v15
|
||||
; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
|
||||
; CI-NEXT: v_or_b32_e32 v11, v16, v11
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
|
||||
; CI-NEXT: v_or_b32_e32 v31, v32, v31
|
||||
; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
|
||||
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
|
||||
; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
|
||||
; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
|
||||
; CI-NEXT: v_or_b32_e32 v14, v15, v14
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
|
||||
; CI-NEXT: v_or_b32_e32 v12, v12, v15
|
||||
; CI-NEXT: v_or_b32_e32 v11, v16, v11
|
||||
; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
|
||||
; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
|
||||
; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
|
||||
; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
|
||||
|
||||
@ -1,6 +1,19 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
|
||||
|
||||
--- |
|
||||
|
||||
@foo = addrspace(3) global i32 poison
|
||||
|
||||
define void @test_overlap() { unreachable }
|
||||
define void @test_dead_redef() { unreachable }
|
||||
define void @test_tied() { unreachable }
|
||||
define void @test_mmo_merge1() { unreachable }
|
||||
define void @test_mmo_merge2() { unreachable }
|
||||
define void @test_mmo_drop() { unreachable }
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: test_overlap
|
||||
body: |
|
||||
@ -47,3 +60,42 @@ body: |
|
||||
%1:vgpr_32 = COPY %0:vgpr_32
|
||||
%2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: test_mmo_merge1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: test_mmo_merge1
|
||||
; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) {
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
|
||||
; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
; CHECK-NEXT: }
|
||||
%1:vgpr_32 = COPY %0:vgpr_32
|
||||
DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
...
|
||||
|
||||
---
|
||||
name: test_mmo_merge2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: test_mmo_merge2
|
||||
; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) {
|
||||
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
|
||||
; CHECK-NEXT: }
|
||||
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
|
||||
...
|
||||
|
||||
---
|
||||
name: test_mmo_drop
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: test_mmo_drop
|
||||
; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec {
|
||||
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
|
||||
; CHECK-NEXT: }
|
||||
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
|
||||
DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
|
||||
...
|
||||
|
||||
@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: v_writelane_b32 v100, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: s_clause 0x1f
|
||||
; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33
|
||||
; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4
|
||||
; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8
|
||||
@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
|
||||
; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi
|
||||
; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo
|
||||
; GFX11-NEXT: s_addk_i32 s32, 0x90
|
||||
; GFX11-NEXT: s_clause 0x1f
|
||||
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
|
||||
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124
|
||||
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120
|
||||
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116
|
||||
@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
|
||||
; GFX11-NEXT: scratch_store_b32 off, v95, s33
|
||||
; GFX11-NEXT: v_writelane_b32 v100, s31, 1
|
||||
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-NEXT: s_clause 0x1f
|
||||
; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
|
||||
; GFX11-NEXT: scratch_load_b32 v95, off, s33
|
||||
; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4
|
||||
; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8
|
||||
@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
|
||||
; GFX11-LABEL: return_72xi32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_clause 0xc
|
||||
; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill
|
||||
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
|
||||
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
|
||||
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204
|
||||
@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
|
||||
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
|
||||
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
|
||||
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
|
||||
; GFX11-NEXT: s_clause 0x5
|
||||
; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
|
||||
; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
|
||||
; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
|
||||
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
|
||||
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
|
||||
; GFX11-NEXT: s_clause 0x10
|
||||
; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
|
||||
; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
|
||||
; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
|
||||
; GFX11-NEXT: s_clause 0xd
|
||||
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
|
||||
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
|
||||
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
|
||||
@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
|
||||
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
|
||||
; GFX11-NEXT: s_clause 0xc
|
||||
; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload
|
||||
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
|
||||
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
|
||||
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172
|
||||
@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX9-NEXT: s_mov_b32 s34, s32
|
||||
; GFX9-NEXT: s_add_i32 s32, s32, 0x28000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
||||
@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v29, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v30, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v31, 0
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_writelane_b32 v63, s31, 1
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
|
||||
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636
|
||||
@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX10-NEXT: s_mov_b32 s38, s34
|
||||
; GFX10-NEXT: s_mov_b32 s34, s32
|
||||
; GFX10-NEXT: s_add_i32 s32, s32, 0x14000
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: v_writelane_b32 v63, s30, 0
|
||||
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
|
||||
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
||||
@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
|
||||
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
|
||||
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
|
||||
; GFX10-NEXT: v_writelane_b32 v63, s30, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0
|
||||
@ -3006,6 +2991,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v31, 0
|
||||
; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi
|
||||
; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: v_writelane_b32 v63, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
|
||||
; GFX10-NEXT: s_clause 0x28
|
||||
@ -3138,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152
|
||||
; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156
|
||||
; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160
|
||||
; GFX10-NEXT: s_clause 0x7
|
||||
; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload
|
||||
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536
|
||||
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540
|
||||
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544
|
||||
@ -3151,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 42
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
|
||||
; GFX10-NEXT: s_clause 0xe
|
||||
; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload
|
||||
; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33
|
||||
; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4
|
||||
; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8
|
||||
@ -3199,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX11-NEXT: s_mov_b32 s36, s34
|
||||
; GFX11-NEXT: s_mov_b32 s34, s32
|
||||
; GFX11-NEXT: s_addk_i32 s32, 0xa00
|
||||
; GFX11-NEXT: s_clause 0xb
|
||||
; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill
|
||||
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44
|
||||
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40
|
||||
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36
|
||||
@ -3341,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 {
|
||||
; GFX11-NEXT: s_add_i32 s2, s32, 16
|
||||
; GFX11-NEXT: v_mov_b32_e32 v30, v46
|
||||
; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
|
||||
; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload
|
||||
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568
|
||||
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552
|
||||
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536
|
||||
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
|
||||
; GFX11-NEXT: s_add_i32 s2, s33, 0x400
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 42
|
||||
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-NEXT: s_clause 0xb
|
||||
; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload
|
||||
; GFX11-NEXT: scratch_load_b32 v59, off, s33
|
||||
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
|
||||
; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8
|
||||
|
||||
@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
|
||||
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
|
||||
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
|
||||
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
|
||||
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
|
||||
|
||||
@ -11,7 +11,7 @@ body: |
|
||||
; CHECK-LABEL: name: mimg_nsa
|
||||
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
|
||||
; CHECK-NEXT: S_CLAUSE 1
|
||||
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
@ -29,7 +29,7 @@ body: |
|
||||
; CHECK-LABEL: name: mimg_nsa_mixed
|
||||
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
|
||||
; CHECK-NEXT: S_CLAUSE 2
|
||||
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
|
||||
|
||||
@ -10,7 +10,7 @@ body: |
|
||||
; CHECK-LABEL: name: mimg
|
||||
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
|
||||
; CHECK-NEXT: S_CLAUSE 1
|
||||
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
@ -28,7 +28,7 @@ body: |
|
||||
; CHECK-LABEL: name: mimg_mixed
|
||||
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
|
||||
; CHECK-NEXT: S_CLAUSE 2
|
||||
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
|
||||
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
|
||||
|
||||
@ -1,13 +1,20 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
|
||||
|
||||
|
||||
; MIR-LABEL: name: gws_barrier_offset0{{$}}
|
||||
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
|
||||
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
|
||||
; MIR-NEXT: S_WAITCNT 0
|
||||
; MIR-NEXT: }
|
||||
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
; MIR-LABEL: name: gws_barrier_offset0
|
||||
; MIR: bb.0 (%ir-block.0):
|
||||
; MIR-NEXT: liveins: $sgpr8_sgpr9
|
||||
; MIR-NEXT: {{ $}}
|
||||
; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
|
||||
; MIR-NEXT: $m0 = S_MOV_B32 0
|
||||
; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
|
||||
; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
|
||||
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
|
||||
; MIR-NEXT: S_WAITCNT 0
|
||||
; MIR-NEXT: }
|
||||
; MIR-NEXT: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { convergent inaccessiblememonly nounwind }
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; MIR: {{.*}}
|
||||
|
||||
@ -35,7 +35,7 @@
|
||||
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
|
||||
; MIR-LABEL: name: gws_barrier_offset0{{$}}
|
||||
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
|
||||
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
|
||||
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
|
||||
; MIR-NEXT: S_WAITCNT 0
|
||||
; MIR-NEXT: }
|
||||
|
||||
@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
|
||||
@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
|
||||
|
||||
@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
|
||||
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
|
||||
@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
|
||||
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
|
||||
@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
|
||||
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
|
||||
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
|
||||
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
|
||||
@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
|
||||
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
|
||||
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
|
||||
|
||||
@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s16
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, 0
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-NEXT: s_nop 0
|
||||
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
|
||||
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
|
||||
@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
|
||||
@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
|
||||
@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_nop 0
|
||||
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
|
||||
@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
||||
@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
|
||||
; SDAG-NEXT: v_mov_b32_e32 v16, 0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v13, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v14, s10
|
||||
@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s15
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s2
|
||||
@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
|
||||
@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
||||
@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
||||
@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
||||
@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
|
||||
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
|
||||
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
|
||||
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v24, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v25, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, s10
|
||||
@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
|
||||
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
|
||||
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
|
||||
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
|
||||
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
|
||||
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
|
||||
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
|
||||
|
||||
@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
|
||||
; GFX8-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v13, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v12, s2
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
|
||||
@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
|
||||
; GFX8-NEXT: v_mov_b32_e32 v14, s2
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
|
||||
; GFX8-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v17, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v16, s2
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
|
||||
@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
|
||||
; GFX8-NEXT: v_mov_b32_e32 v19, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v18, s2
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s7
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
|
||||
; GFX8-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
|
||||
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s5
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s12
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, s13
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v10, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v11, s15
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
|
||||
@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
|
||||
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
|
||||
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
|
||||
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
|
||||
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
|
||||
|
||||
@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
|
||||
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
|
||||
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
|
||||
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
|
||||
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
|
||||
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
||||
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
|
||||
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
|
||||
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
|
||||
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
|
||||
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
|
||||
@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
|
||||
; GCN-GFX900-HSA-NEXT: s_nop 0
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
|
||||
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
|
||||
; GCN-GFX900-HSA-NEXT: s_nop 0
|
||||
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
|
||||
@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
|
||||
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
|
||||
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
|
||||
@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
|
||||
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
|
||||
@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
|
||||
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
|
||||
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
|
||||
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
|
||||
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
|
||||
|
||||
@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
|
||||
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
|
||||
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
|
||||
@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-VI-NEXT: s_nop 0
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: s_endpgm
|
||||
|
||||
@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
|
||||
; VI-NO-DS128: ; %bb.0:
|
||||
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
||||
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
|
||||
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
|
||||
@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
|
||||
@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
|
||||
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
|
||||
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
|
||||
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
|
||||
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
|
||||
@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
|
||||
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: s_nop 0
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
|
||||
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
|
||||
@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
|
||||
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: s_nop 0
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
|
||||
@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
||||
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
||||
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
|
||||
@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
|
||||
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
||||
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
|
||||
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
|
||||
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
|
||||
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
|
||||
@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
||||
; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
||||
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
||||
; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
|
||||
@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
|
||||
; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
|
||||
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
|
||||
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
|
||||
@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
|
||||
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
||||
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
|
||||
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
|
||||
@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
|
||||
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
|
||||
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
|
||||
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
|
||||
@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_nop 0
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
|
||||
@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_nop 0
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
|
||||
@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
|
||||
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
||||
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
|
||||
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
|
||||
@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
|
||||
; VI-NO-DS128: ; %bb.0:
|
||||
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
|
||||
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
||||
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
|
||||
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
|
||||
@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
|
||||
@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
|
||||
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
|
||||
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
||||
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
|
||||
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
|
||||
@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
|
||||
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: s_nop 0
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
|
||||
@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
|
||||
@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: s_nop 0
|
||||
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
|
||||
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
|
||||
@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
|
||||
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
|
||||
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
||||
; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
||||
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
||||
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
|
||||
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
|
||||
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
|
||||
@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
|
||||
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
|
||||
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
|
||||
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
||||
; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
|
||||
@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
|
||||
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
|
||||
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
|
||||
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
|
||||
@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
|
||||
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
|
||||
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
|
||||
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
|
||||
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
||||
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
|
||||
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
|
||||
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
|
||||
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
|
||||
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
|
||||
@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
||||
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
|
||||
@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_nop 0
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
|
||||
@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: s_nop 0
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
|
||||
@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
|
||||
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
|
||||
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
||||
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
|
||||
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
|
||||
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
|
||||
|
||||
@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
|
||||
; GFX12-NEXT: s_mov_b32 s9, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_mov_b32 s2, s1
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
|
||||
; GFX12-NEXT: s_mov_b32 s9, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_mov_b32 s2, s1
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
|
||||
@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7)
|
||||
; GFX12-NEXT: s_mov_b32 s9, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX12-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-NEXT: s_mov_b32 s2, s1
|
||||
; GFX12-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
|
||||
@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
|
||||
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
|
||||
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
|
||||
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
|
||||
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
|
||||
; GFX10-SDAG-NEXT: s_clause 0x1
|
||||
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
|
||||
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
|
||||
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
|
||||
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
|
||||
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
|
||||
; GFX11-SDAG-NEXT: s_clause 0x1
|
||||
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
|
||||
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
|
||||
; GFX11-GISEL-NEXT: s_clause 0x1
|
||||
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
|
||||
@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
|
||||
; GFX12-SDAG-NEXT: s_clause 0x1
|
||||
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
|
||||
; GFX12-GISEL-NEXT: s_clause 0x1
|
||||
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
|
||||
@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
|
||||
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
|
||||
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
|
||||
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
|
||||
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
|
||||
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-SDAG-NEXT: s_clause 0x1
|
||||
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
|
||||
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
|
||||
; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
|
||||
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
|
||||
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_clause 0x1
|
||||
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
|
||||
@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
|
||||
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_clause 0x1
|
||||
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
|
||||
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
|
||||
@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_clause 0x1
|
||||
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
|
||||
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
|
||||
; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
|
||||
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
|
||||
@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_clause 0x1
|
||||
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
|
||||
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
|
||||
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
|
||||
|
||||
@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
|
||||
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
|
||||
; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
|
||||
; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
|
||||
; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
|
||||
; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
|
||||
; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
|
||||
; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
|
||||
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
|
||||
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
|
||||
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
|
||||
|
||||
@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
|
||||
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
|
||||
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
|
||||
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
|
||||
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
|
||||
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
|
||||
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
|
||||
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
|
||||
; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
|
||||
; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
|
||||
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
|
||||
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
|
||||
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
|
||||
; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
|
||||
; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
|
||||
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
|
||||
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
|
||||
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
|
||||
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
|
||||
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
|
||||
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
|
||||
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
|
||||
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
|
||||
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
|
||||
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
|
||||
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
|
||||
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
|
||||
@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
|
||||
@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
|
||||
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
|
||||
@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
|
||||
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
;
|
||||
; GFX1250-SDAG-LABEL: fadd_v32_vs:
|
||||
; GFX1250-SDAG: ; %bb.0:
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x2
|
||||
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x7
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
|
||||
@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x1
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
|
||||
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
|
||||
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
|
||||
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
|
||||
@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
|
||||
@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
|
||||
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
|
||||
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
|
||||
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
|
||||
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
|
||||
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
|
||||
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
|
||||
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
|
||||
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
|
||||
; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
|
||||
; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
|
||||
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
|
||||
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
|
||||
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
|
||||
; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
|
||||
; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
|
||||
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
|
||||
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
|
||||
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
|
||||
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
|
||||
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
|
||||
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
|
||||
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
|
||||
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
|
||||
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
|
||||
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
|
||||
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
|
||||
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
|
||||
@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
|
||||
@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
|
||||
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
|
||||
@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
|
||||
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
;
|
||||
; GFX1250-SDAG-LABEL: fmul_v32_vs:
|
||||
; GFX1250-SDAG: ; %bb.0:
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x2
|
||||
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x7
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
|
||||
@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x1
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
|
||||
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
|
||||
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
|
||||
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
|
||||
@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
|
||||
@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
|
||||
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
|
||||
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
|
||||
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
|
||||
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
|
||||
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
|
||||
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
||||
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
|
||||
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
|
||||
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
|
||||
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
|
||||
; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
|
||||
; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
|
||||
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
|
||||
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
|
||||
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
|
||||
; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
|
||||
; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
|
||||
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
|
||||
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
|
||||
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
|
||||
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
|
||||
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
|
||||
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
|
||||
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
|
||||
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
|
||||
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
|
||||
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
|
||||
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
|
||||
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
|
||||
@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
|
||||
@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
|
||||
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
|
||||
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
|
||||
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
|
||||
@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
|
||||
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
||||
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
|
||||
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
|
||||
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
|
||||
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
|
||||
@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-SDAG: ; %bb.0:
|
||||
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x1
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
|
||||
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-SDAG-NEXT: s_clause 0x1
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
|
||||
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
|
||||
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
|
||||
@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
|
||||
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
|
||||
; GFX1250-GISEL-NEXT: s_clause 0x1
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
|
||||
; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
|
||||
|
||||
@ -56,11 +56,11 @@ body: |
|
||||
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
; GCN-NEXT: }
|
||||
; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
|
||||
; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) {
|
||||
; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
|
||||
; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
|
||||
; GCN-NEXT: }
|
||||
; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
|
||||
; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) {
|
||||
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
|
||||
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
|
||||
; GCN-NEXT: }
|
||||
@ -359,6 +359,7 @@ tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
; GCN-LABLE: name: no_sched_barrier_within_bundle
|
||||
; GCN-LABEL: name: no_sched_barrier_within_bundle
|
||||
; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
|
||||
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
|
||||
; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
|
||||
|
||||
@ -9,7 +9,7 @@ body: |
|
||||
; GFX12-LABEL: name: post_bundle_vimage
|
||||
; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
|
||||
; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) {
|
||||
; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
|
||||
; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
|
||||
; GFX12-NEXT: }
|
||||
@ -25,7 +25,7 @@ body: |
|
||||
; GFX12-LABEL: name: post_bundle_vsample
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
|
||||
; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) {
|
||||
; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
|
||||
; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
|
||||
; GFX12-NEXT: }
|
||||
|
||||
@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5]
|
||||
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
|
||||
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
|
||||
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2
|
||||
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
|
||||
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2
|
||||
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
|
||||
@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
|
||||
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
|
||||
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
|
||||
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
|
||||
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
|
||||
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
|
||||
@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
|
||||
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
|
||||
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
|
||||
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
|
||||
; GFX900-NEXT: s_addk_i32 s5, 0x2000
|
||||
; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
|
||||
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
|
||||
; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
|
||||
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
|
||||
@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
|
||||
; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
|
||||
; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
|
||||
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(4)
|
||||
@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
|
||||
@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GFX90A-NEXT: s_addk_i32 s3, 0x2000
|
||||
; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
|
||||
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -3,9 +3,6 @@
|
||||
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
|
||||
; CHECK-LABEL: excess_soft_clause_reg_pressure:
|
||||
; CHECK: BB0_1: ; %for.cond28.preheader
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK: global_load_dword
|
||||
; CHECK-NEXT: global_load_dword
|
||||
; CHECK-NEXT: global_load_dword
|
||||
@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
|
||||
; CHECK-NOT: v_readlane_b32
|
||||
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK-NOT: v_writelane_b32
|
||||
; CHECK-NOT: v_readlane_b32
|
||||
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK-NOT: v_writelane_b32
|
||||
; CHECK-NOT: v_readlane_b32
|
||||
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK-NOT: v_writelane_b32
|
||||
; CHECK-NOT: v_readlane_b32
|
||||
|
||||
entry:
|
||||
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%i2 = load i64, ptr addrspace(4) %i, align 8
|
||||
|
||||
@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
|
||||
; GFX90A-NEXT: s_nop 4
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
|
||||
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
|
||||
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
|
||||
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
|
||||
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
|
||||
@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
|
||||
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
|
||||
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
|
||||
@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
|
||||
@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
|
||||
@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
|
||||
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
|
||||
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
|
||||
@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
|
||||
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
|
||||
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
|
||||
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
|
||||
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
|
||||
|
||||
@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[18:19]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s16, 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v32, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s34, 3
|
||||
; GCN-NEXT: s_mov_b32 s34, s32
|
||||
; GCN-NEXT: v_mov_b32_e32 v32, 0
|
||||
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
|
||||
|
||||
@ -98,28 +98,29 @@ body: |
|
||||
|
||||
; CHECK-LABEL: name: foo
|
||||
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
|
||||
; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
|
||||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
|
||||
; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
|
||||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
|
||||
; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
|
||||
; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
|
||||
; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
|
||||
; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr {
|
||||
; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
|
||||
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
|
||||
; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
|
||||
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
|
||||
; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
|
||||
; CHECK: }
|
||||
; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 {
|
||||
; CHECK: MVE_VPST 4, implicit $vpr
|
||||
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
|
||||
; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
|
||||
; CHECK: }
|
||||
; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
|
||||
; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
|
||||
; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
|
||||
; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
|
||||
; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
|
||||
; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
|
||||
; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
|
||||
; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
|
||||
; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
|
||||
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
|
||||
; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
|
||||
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
|
||||
; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
|
||||
; CHECK-NEXT: }
|
||||
; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
|
||||
; CHECK-NEXT: MVE_VPST 4, implicit $vpr
|
||||
; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
|
||||
; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
|
||||
; CHECK-NEXT: }
|
||||
; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
|
||||
$sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
|
||||
frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user