Revert "[RegAlloc] Relax the split constrain on MBB prolog" (#169990)

Reverts llvm/llvm-project#168259

breaks hip buildbot
This commit is contained in:
theRonShark 2025-11-29 08:01:23 -05:00 committed by GitHub
parent d3762edd5f
commit 3a1079fa25
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 3458 additions and 3733 deletions

View File

@ -774,7 +774,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
// Abort if the spill cannot be inserted at the MBB's start
if (((BC.Entry == SpillPlacement::MustSpill) ||
(BC.Entry == SpillPlacement::PrefSpill)) &&
!SA->canSplitBeforeProlog(BC.Number))
SlotIndex::isEarlierInstr(BI.FirstInstr,
SA->getFirstSplitPoint(BC.Number)))
return false;
}
@ -829,7 +830,11 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
BCS[B].Number = Number;
// Abort if the spill cannot be inserted at the MBB's start
if (!SA->canSplitBeforeProlog(Number))
MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
if (FirstNonDebugInstr != MBB->end() &&
SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
SA->getFirstSplitPoint(Number)))
return false;
// Interference for the live-in value.
if (Intf.first() <= Indexes->getMBBStartIdx(Number))

View File

@ -147,54 +147,6 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
return LIS.getInstructionFromIndex(LIP);
}
/// Decide whether \p CurLI may be split ahead of the prologue of \p MBB.
///
/// Walks \p MBB from its start, ignoring PHIs, position instructions, debug
/// instructions and pseudo probes. The first remaining instruction that the
/// target does not classify as a basic-block prologue ends the scan with
/// `true`. For every prologue instruction seen before that, a
/// virtual-register def whose register class shares a common subclass with
/// the register class of \p CurLI makes the function return `false`.
bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
const MachineBasicBlock &MBB) {
const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
for (auto &MI : MBB) {
// Skip PHIs, position instructions, debug instructions and pseudo probes.
if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
MI.isPseudoProbe())
continue;
// Reaching a non-prologue instruction means no conflict was found.
if (!TII->isBasicBlockPrologue(MI))
return true;
for (auto &MO : MI.operands()) {
// Only virtual-register defs are inspected.
if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
continue;
// For the AMDGPU target, if an MBB contains an exec mask restore
// preamble, SplitEditor may get into a state where it cannot insert a
// spill instruction at the beginning of the MBB.
// E.g. for the MIR
// bb.100:
// %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
// implicit $exec
// ...
// use %1
// If the regalloc tries to allocate a virtreg to the physreg already
// assigned to virtreg %1, and that physreg is computed as the best
// candidate for the split, it may insert a COPY instruction:
// bb.100:
// %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
// implicit $exec
// %2 = COPY %orig
// ...
// use %1
// Thus %1 and %orig still interfere. We may add cost for the
// physreg candidate or abandon the candidate.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
// A shared subclass between the prologue def and CurLI blocks the split.
if (TRI->getCommonSubClass(RC, CurRC))
return false;
}
}
// The whole block was scanned without finding a conflicting prologue def.
return true;
}
//===----------------------------------------------------------------------===//
// Split Analysis
//===----------------------------------------------------------------------===//

View File

@ -89,9 +89,6 @@ public:
return Res;
}
/// Return true if we can split \p CurLI before \p MBB's prolog.
bool canSplitBeforeProlog(const LiveInterval &CurLI,
const MachineBasicBlock &MBB);
};
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@ -250,11 +247,6 @@ public:
/// Return the first insert point of the basic block numbered \p Num, as
/// reported by IPA.getFirstInsertPoint().
SlotIndex getFirstSplitPoint(unsigned Num) {
return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
}
/// Look up the basic block numbered \p Num and forward to
/// IPA.canSplitBeforeProlog() with the current live interval (CurLI).
bool canSplitBeforeProlog(unsigned Num) {
MachineBasicBlock *BB = MF.getBlockNumbered(Num);
return IPA.canSplitBeforeProlog(*CurLI, *BB);
}
};
/// SplitEditor - Edit machine code and LiveIntervals for live range

File diff suppressed because it is too large Load Diff

View File

@ -47208,32 +47208,33 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
; SI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SI-NEXT: v_cvt_f16_f32_e32 v47, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
; SI-NEXT: v_cvt_f16_f32_e32 v2, v8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
; SI-NEXT: v_cvt_f16_f32_e32 v2, v8
; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
; SI-NEXT: v_cvt_f16_f32_e32 v46, v10
; SI-NEXT: v_cvt_f16_f32_e32 v5, v11
; SI-NEXT: v_cvt_f16_f32_e32 v8, v12
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v45, v14
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v15
; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
; SI-NEXT: v_cvt_f16_f32_e32 v12, v16
; SI-NEXT: v_cvt_f16_f32_e32 v13, v17
; SI-NEXT: v_cvt_f16_f32_e32 v44, v18
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v15
; SI-NEXT: v_cvt_f16_f32_e32 v10, v19
; SI-NEXT: v_cvt_f16_f32_e32 v17, v20
; SI-NEXT: v_cvt_f16_f32_e32 v11, v21
; SI-NEXT: v_cvt_f16_f32_e32 v14, v22
; SI-NEXT: v_cvt_f16_f32_e32 v43, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v24
; SI-NEXT: v_cvt_f16_f32_e32 v9, v25
@ -47242,44 +47243,44 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v18, v28
; SI-NEXT: v_cvt_f16_f32_e32 v7, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v30
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v22, v40
; SI-NEXT: v_cvt_f16_f32_e32 v20, v57
; SI-NEXT: v_cvt_f16_f32_e32 v57, v58
; SI-NEXT: v_cvt_f16_f32_e32 v40, v59
; SI-NEXT: v_cvt_f16_f32_e32 v15, v60
; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
; SI-NEXT: v_cvt_f16_f32_e32 v14, v61
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: v_cvt_f16_f32_e32 v30, v63
; SI-NEXT: v_cvt_f16_f32_e32 v16, v33
; SI-NEXT: v_cvt_f16_f32_e32 v63, v35
; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
; SI-NEXT: v_cvt_f16_f32_e32 v29, v50
; SI-NEXT: v_cvt_f16_f32_e32 v35, v53
; SI-NEXT: v_cvt_f16_f32_e32 v33, v55
; SI-NEXT: v_cvt_f16_f32_e32 v58, v31
; SI-NEXT: v_cvt_f16_f32_e32 v49, v31
; SI-NEXT: v_cvt_f16_f32_e32 v28, v32
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v50, v34
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f16_f32_e32 v60, v37
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v27, v38
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v39
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v48
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v39, v51
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v26, v52
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v34, v54
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v41
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@ -47289,34 +47290,47 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_mov_b32_e32 v54, v9
; SI-NEXT: v_mov_b32_e32 v55, v11
; SI-NEXT: v_mov_b32_e32 v41, v13
; SI-NEXT: v_mov_b32_e32 v48, v4
; SI-NEXT: v_mov_b32_e32 v4, v3
; SI-NEXT: v_mov_b32_e32 v3, v43
; SI-NEXT: v_mov_b32_e32 v48, v5
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.true
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v37, v56
; SI-NEXT: v_mov_b32_e32 v7, v39
; SI-NEXT: v_cvt_f32_f16_e32 v39, v47
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v39
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v11, v33
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v25
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_cvt_f32_f16_e32 v13, v35
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v33, v11
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
@ -47335,7 +47349,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
@ -47354,50 +47367,41 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v38, v5
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v38, v7
; SI-NEXT: v_mov_b32_e32 v7, v39
; SI-NEXT: v_cvt_f32_f16_e32 v39, v47
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v37, v39
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48
; SI-NEXT: v_or_b32_e32 v9, v38, v47
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v5, v38, v47
; SI-NEXT: v_cvt_f32_f16_e32 v38, v46
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v37, v9
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_or_b32_e32 v48, v39, v46
; SI-NEXT: v_cvt_f32_f16_e32 v39, v45
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39
; SI-NEXT: v_cvt_f32_f16_e32 v39, v41
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v37, v9
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v38, v9
; SI-NEXT: v_or_b32_e32 v9, v37, v45
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@ -47408,37 +47412,35 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_or_b32_e32 v9, v38, v57
; SI-NEXT: v_cvt_f32_f16_e32 v38, v14
; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37
; SI-NEXT: v_cvt_f32_f16_e32 v37, v55
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, v58
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v25
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38
; SI-NEXT: v_cvt_f32_f16_e32 v38, v54
; SI-NEXT: v_or_b32_e32 v41, v39, v43
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38
; SI-NEXT: v_cvt_f32_f16_e32 v38, v54
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v9, v63
; SI-NEXT: v_or_b32_e32 v55, v37, v42
; SI-NEXT: v_cvt_f32_f16_e32 v37, v52
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_or_b32_e32 v54, v25, v58
; SI-NEXT: v_cvt_f32_f16_e32 v25, v40
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v51
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_or_b32_e32 v52, v37, v40
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_or_b32_e32 v52, v37, v40
; SI-NEXT: v_cvt_f16_f32_e32 v37, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v62
@ -47449,22 +47451,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v29, v49
; SI-NEXT: v_cvt_f32_f16_e32 v29, v61
; SI-NEXT: v_or_b32_e32 v62, v25, v59
; SI-NEXT: v_cvt_f32_f16_e32 v25, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v9
; SI-NEXT: v_cvt_f32_f16_e32 v21, v49
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_or_b32_e32 v49, v29, v28
; SI-NEXT: v_or_b32_e32 v61, v29, v28
; SI-NEXT: v_cvt_f16_f32_e32 v29, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v60
; SI-NEXT: v_or_b32_e32 v38, v21, v27
; SI-NEXT: v_or_b32_e32 v49, v21, v27
; SI-NEXT: v_cvt_f32_f16_e32 v21, v26
; SI-NEXT: v_cvt_f32_f16_e32 v26, v7
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
@ -47478,7 +47480,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v25, v32
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v34
; SI-NEXT: v_or_b32_e32 v41, v39, v43
; SI-NEXT: v_or_b32_e32 v39, v29, v26
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f32_f16_e32 v29, v31
@ -47492,19 +47493,14 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v21, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v36
; SI-NEXT: v_cvt_f32_f16_e32 v29, v50
; SI-NEXT: v_cvt_f32_f16_e32 v9, v63
; SI-NEXT: v_cvt_f32_f16_e32 v14, v61
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v36, v25
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v63, v9
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31
; SI-NEXT: v_cvt_f16_f32_e32 v61, v14
; SI-NEXT: v_or_b32_e32 v53, v7, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36
; SI-NEXT: v_or_b32_e32 v50, v25, v21
@ -47512,17 +47508,18 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v35, v13, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63
; SI-NEXT: v_or_b32_e32 v16, v16, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14
; SI-NEXT: v_or_b32_e32 v15, v15, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v22
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20
; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16
; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16
; SI-NEXT: v_mov_b32_e32 v60, v37
; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16
; SI-NEXT: v_or_b32_e32 v22, v21, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18
; SI-NEXT: v_or_b32_e32 v24, v24, v21
@ -47543,15 +47540,19 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16
; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16
; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16
; SI-NEXT: v_alignbit_b32 v14, v19, v42, 16
; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16
; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16
; SI-NEXT: v_mov_b32_e32 v58, v38
; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16
; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16
; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16
; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16
; SI-NEXT: v_mov_b32_e32 v60, v37
; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@ -47560,7 +47561,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v37, 0xffff, v7
; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v37, v37, v38
; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
@ -47584,8 +47585,10 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
@ -47616,7 +47619,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@ -47658,7 +47661,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@ -47675,7 +47678,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
@ -47687,7 +47690,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0

View File

@ -51080,79 +51080,79 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
; SI-NEXT: v_cvt_f16_f32_e32 v52, v6
; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v9
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
; SI-NEXT: v_cvt_f16_f32_e32 v16, v22
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v9
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v11, v23
; SI-NEXT: v_cvt_f16_f32_e32 v5, v24
; SI-NEXT: v_cvt_f16_f32_e32 v18, v25
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_cvt_f16_f32_e32 v24, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v27
; SI-NEXT: v_cvt_f16_f32_e32 v27, v28
; SI-NEXT: v_cvt_f16_f32_e32 v25, v29
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v28, v30
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51
; SI-NEXT: v_cvt_f16_f32_e32 v22, v43
; SI-NEXT: v_cvt_f16_f32_e32 v23, v44
; SI-NEXT: v_cvt_f16_f32_e32 v29, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
; SI-NEXT: v_cvt_f16_f32_e32 v44, v62
; SI-NEXT: v_cvt_f16_f32_e32 v62, v63
; SI-NEXT: v_cvt_f16_f32_e32 v18, v63
; SI-NEXT: v_cvt_f16_f32_e32 v19, v33
; SI-NEXT: v_cvt_f16_f32_e32 v61, v36
; SI-NEXT: v_cvt_f16_f32_e32 v33, v36
; SI-NEXT: v_cvt_f16_f32_e32 v43, v39
; SI-NEXT: v_cvt_f16_f32_e32 v15, v50
; SI-NEXT: v_cvt_f16_f32_e32 v63, v54
; SI-NEXT: v_cvt_f16_f32_e32 v16, v54
; SI-NEXT: v_cvt_f16_f32_e32 v54, v41
; SI-NEXT: v_cvt_f16_f32_e32 v51, v42
; SI-NEXT: v_cvt_f16_f32_e32 v14, v45
; SI-NEXT: v_cvt_f16_f32_e32 v13, v46
; SI-NEXT: v_cvt_f16_f32_e32 v12, v45
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v50, v47
; SI-NEXT: v_cvt_f16_f32_e32 v13, v46
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v56
; SI-NEXT: v_cvt_f16_f32_e32 v50, v47
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v12, v57
; SI-NEXT: v_cvt_f16_f32_e32 v30, v56
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v58
; SI-NEXT: v_cvt_f16_f32_e32 v63, v57
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v31
; SI-NEXT: v_cvt_f16_f32_e32 v62, v58
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v45, v32
; SI-NEXT: v_cvt_f16_f32_e32 v36, v31
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v59
; SI-NEXT: v_cvt_f16_f32_e32 v45, v32
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v34
; SI-NEXT: v_cvt_f16_f32_e32 v31, v59
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v33, v35
; SI-NEXT: v_cvt_f16_f32_e32 v32, v34
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v35
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100
@ -51177,21 +51177,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: v_mov_b32_e32 v58, v8
; SI-NEXT: v_mov_b32_e32 v8, v60
; SI-NEXT: v_mov_b32_e32 v46, v52
; SI-NEXT: v_mov_b32_e32 v52, v55
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v47, v21
; SI-NEXT: v_mov_b32_e32 v56, v17
; SI-NEXT: v_mov_b32_e32 v57, v6
; SI-NEXT: v_mov_b32_e32 v59, v61
; SI-NEXT: v_mov_b32_e32 v61, v29
; SI-NEXT: v_mov_b32_e32 v29, v25
; SI-NEXT: v_mov_b32_e32 v25, v18
; SI-NEXT: v_mov_b32_e32 v21, v16
; SI-NEXT: v_mov_b32_e32 v17, v1
; SI-NEXT: v_mov_b32_e32 v58, v7
; SI-NEXT: v_mov_b32_e32 v59, v33
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.true
@ -51201,14 +51191,12 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v7, v62
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f32_f16_e32 v6, v12
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v6, v63
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f32_f16_e32 v12, v14
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
@ -51218,36 +51206,35 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v62, v7
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_or_b32_e32 v3, v3, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_or_b32_e32 v9, v9, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
; SI-NEXT: v_or_b32_e32 v1, v31, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13
; SI-NEXT: v_or_b32_e32 v31, v12, v34
; SI-NEXT: v_mov_b32_e32 v12, v6
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_or_b32_e32 v31, v31, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62
; SI-NEXT: v_or_b32_e32 v63, v6, v34
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v18, v62
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v63, v16
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
@ -51261,11 +51248,13 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v63
; SI-NEXT: v_or_b32_e32 v12, v12, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
@ -51275,7 +51264,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_or_b32_e32 v62, v18, v34
; SI-NEXT: v_or_b32_e32 v18, v18, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v22, v22, v34
@ -51287,9 +51276,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v2, v34
; SI-NEXT: v_cvt_f32_f16_e32 v34, v38
; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v38, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
@ -51301,89 +51289,79 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v34, v49
; SI-NEXT: v_cvt_f32_f16_e32 v35, v48
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v49, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_or_b32_e32 v48, v34, v35
; SI-NEXT: v_cvt_f32_f16_e32 v34, v53
; SI-NEXT: v_cvt_f32_f16_e32 v35, v52
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v53, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_or_b32_e32 v52, v34, v35
; SI-NEXT: v_cvt_f32_f16_e32 v34, v40
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v55
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v40, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40
; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_or_b32_e32 v55, v34, v35
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v34, v60
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_or_b32_e32 v6, v35, v34
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_or_b32_e32 v6, v35, v34
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
@ -51391,91 +51369,102 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; SI-NEXT: v_or_b32_e32 v25, v25, v24
; SI-NEXT: v_or_b32_e32 v29, v29, v28
; SI-NEXT: v_or_b32_e32 v54, v54, v51
; SI-NEXT: v_or_b32_e32 v50, v50, v30
; SI-NEXT: v_or_b32_e32 v33, v33, v42
; SI-NEXT: v_or_b32_e32 v39, v39, v41
; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16
; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16
; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16
; SI-NEXT: v_alignbit_b32 v30, v12, v30, 16
; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16
; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16
; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_or_b32_e32 v6, v35, v1
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
; SI-NEXT: v_cvt_f32_f16_e32 v46, v57
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_or_b32_e32 v58, v35, v8
; SI-NEXT: v_cvt_f32_f16_e32 v35, v56
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_or_b32_e32 v56, v35, v17
; SI-NEXT: v_alignbit_b32 v17, v2, v17, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v14, v6
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v57, v46, v14
; SI-NEXT: v_cvt_f32_f16_e32 v46, v47
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
; SI-NEXT: v_cvt_f32_f16_e32 v46, v59
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v46, v47
; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v17, v6
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_or_b32_e32 v56, v35, v17
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v46, v59
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
; SI-NEXT: v_or_b32_e32 v59, v46, v43
; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v21, v6
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v47, v35, v21
; SI-NEXT: v_cvt_f32_f16_e32 v35, v44
; SI-NEXT: v_cvt_f32_f16_e32 v44, v61
; SI-NEXT: v_or_b32_e32 v59, v46, v43
; SI-NEXT: v_alignbit_b32 v46, v52, v1, 16
; SI-NEXT: v_alignbit_b32 v1, v37, v14, 16
; SI-NEXT: v_mov_b32_e32 v14, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: v_alignbit_b32 v21, v11, v21, 16
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16
; SI-NEXT: v_or_b32_e32 v61, v44, v35
; SI-NEXT: v_cvt_f32_f16_e32 v44, v45
; SI-NEXT: v_alignbit_b32 v51, v14, v51, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44
; SI-NEXT: v_or_b32_e32 v36, v36, v45
; SI-NEXT: v_alignbit_b32 v44, v62, v35, 16
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16
; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, v6
; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_or_b32_e32 v6, v33, v42
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16
; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
@ -51487,7 +51476,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v34, v35
; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0
; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
; SI-NEXT: v_or_b32_e32 v1, v34, v1
@ -51510,11 +51499,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
@ -51524,9 +51511,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
@ -51536,9 +51525,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@ -51579,7 +51570,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
@ -51592,7 +51583,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@ -51603,7 +51594,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
@ -51615,8 +51606,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@ -51633,8 +51624,10 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen

View File

@ -42,11 +42,14 @@ body: |
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr20_sgpr21 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr22_sgpr23 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0
; CHECK-NEXT: renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec
; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240
@ -55,7 +58,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
@ -64,7 +67,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72
@ -92,12 +95,12 @@ body: |
; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57
; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76
; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58 = COPY renamable $sgpr52_sgpr53_sgpr54
; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr56_sgpr57_sgpr58
; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr76
; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54
; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50
; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68
; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72
; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
@ -162,22 +165,23 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr22_sgpr23, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6:
; CHECK-NEXT: successors: %bb.7(0x80000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.7:
; CHECK-NEXT: successors: %bb.8(0x80000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
@ -185,14 +189,14 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.8:
; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.9:
; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec
@ -214,17 +218,11 @@ body: |
; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15
; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14
; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19
; CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr20_sgpr21
; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr22_sgpr23
; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr24_sgpr25
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr38_sgpr39
; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr36_sgpr37
; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr50_sgpr51
; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49
; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85
; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83
@ -240,42 +238,44 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.10:
; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.12
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.11:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.17
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.12:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.13:
; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.14
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.14:
; CHECK-NEXT: successors: %bb.15(0x80000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.15:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr20_sgpr21, implicit-def dead $scc
; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.16:

View File

@ -42,13 +42,11 @@ body: |
%24:sgpr_128 = COPY %1
%25:sgpr_128 = COPY %1
%26:sgpr_128 = COPY %1
%27:sgpr_128 = COPY %1
S_BRANCH %bb.1
bb.1:
liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
%27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
@ -57,7 +55,6 @@ body: |
bb.2:
liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
%27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
@ -66,7 +63,6 @@ body: |
bb.3:
liveins: $sgpr102_sgpr103
%27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_BRANCH %bb.4
@ -85,7 +81,6 @@ body: |
S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %26.sub0_sub1, %27.sub2_sub3, implicit-def $scc
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0
...

View File

@ -1,167 +0,0 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s
---
# Check that spill save/restore should be inserted after $exec mask is defined.
name: foo
tracksRegLiveness: true
machineFunctionInfo:
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
stackPtrOffsetReg: $sgpr32
body: |
; CHECK-LABEL: name: foo
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5)
; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5)
; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5)
; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5)
; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: liveins: $sgpr102_sgpr103
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: $exec = IMPLICIT_DEF
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5)
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0
bb.0:
liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
%0:sreg_64 = COPY $sgpr102_sgpr103
%1:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
%2:sgpr_128 = COPY %1
%3:sgpr_128 = COPY %1
%4:sgpr_128 = COPY %1
%5:sgpr_128 = COPY %1
%6:sgpr_128 = COPY %1
%7:sgpr_128 = COPY %1
%8:sgpr_128 = COPY %1
%9:sgpr_128 = COPY %1
%10:sgpr_128 = COPY %1
%11:sgpr_128 = COPY %1
%12:sgpr_128 = COPY %1
%13:sgpr_128 = COPY %1
%14:sgpr_128 = COPY %1
%15:sgpr_128 = COPY %1
%16:sgpr_128 = COPY %1
%17:sgpr_128 = COPY %1
%18:sgpr_128 = COPY %1
%19:sgpr_128 = COPY %1
%20:sgpr_128 = COPY %1
%21:sgpr_128 = COPY %1
%22:sgpr_128 = COPY %1
%23:sgpr_128 = COPY %1
%24:sgpr_128 = COPY %1
%25:sgpr_128 = COPY %1
%26:sgpr_128 = COPY %1
S_BRANCH %bb.1
bb.1:
liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
S_BRANCH %bb.2
bb.2:
liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
S_BRANCH %bb.4
bb.3:
liveins: $sgpr102_sgpr103
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_BRANCH %bb.4
bb.4:
$exec = IMPLICIT_DEF
S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %7.sub0_sub1, %8.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %9.sub0_sub1, %10.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %11.sub0_sub1, %12.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %13.sub0_sub1, %14.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %15.sub0_sub1, %16.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %17.sub0_sub1, %18.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %19.sub0_sub1, %20.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0
...

View File

@ -9742,122 +9742,170 @@ entry:
define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: test_limited_sgpr:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0
; GFX6-NEXT: s_mov_b32 s18, 0
; GFX6-NEXT: v_mov_b32_e32 v6, 0
; GFX6-NEXT: s_mov_b32 s19, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[16:17], s[14:15]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240
; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s42, -1
; GFX6-NEXT: s_mov_b32 s43, 0xe8f000
; GFX6-NEXT: s_add_u32 s40, s40, s11
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; GFX6-NEXT: s_addc_u32 s41, s41, 0
; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[16:19], 0 addr64 offset:32
; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[16:19], 0 addr64 offset:48
; GFX6-NEXT: s_waitcnt vmcnt(2)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0
; GFX6-NEXT: v_mov_b32_e32 v6, 0
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_writelane_b32 v1, s0, 0
; GFX6-NEXT: v_writelane_b32 v1, s1, 1
; GFX6-NEXT: v_writelane_b32 v1, s2, 2
; GFX6-NEXT: v_writelane_b32 v1, s3, 3
; GFX6-NEXT: s_mov_b32 s8, 0x80400
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224
; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208
; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
; GFX6-NEXT: s_mov_b32 s2, 0x86a00
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:192
; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
; GFX6-NEXT: s_mov_b32 s2, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176
; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
; GFX6-NEXT: s_mov_b32 s2, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160
; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
; GFX6-NEXT: s_mov_b32 s2, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144
; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
; GFX6-NEXT: s_mov_b32 s2, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128
; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
; GFX6-NEXT: s_mov_b32 s2, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
; GFX6-NEXT: s_mov_b32 s2, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
; GFX6-NEXT: s_mov_b32 s2, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
; GFX6-NEXT: s_mov_b32 s2, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
; GFX6-NEXT: s_mov_b32 s2, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
; GFX6-NEXT: s_mov_b32 s2, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
; GFX6-NEXT: s_mov_b32 s2, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
; GFX6-NEXT: s_mov_b32 s2, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
; GFX6-NEXT: s_mov_b32 s2, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s0, 0
; GFX6-NEXT: v_writelane_b32 v4, s1, 1
; GFX6-NEXT: v_writelane_b32 v4, s2, 2
; GFX6-NEXT: v_writelane_b32 v4, s3, 3
; GFX6-NEXT: s_mov_b32 s10, 0x80800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96
; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80
; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[16:19], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[16:19], 0 addr64 offset:16
; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
; GFX6-NEXT: s_waitcnt expcnt(3)
; GFX6-NEXT: v_mov_b32_e32 v7, 1
; GFX6-NEXT: s_mov_b64 s[0:1], exec
@ -9876,77 +9924,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s9, 5
; GFX6-NEXT: v_writelane_b32 v4, s10, 6
; GFX6-NEXT: v_writelane_b32 v4, s11, 7
; GFX6-NEXT: s_mov_b32 s2, 0x80400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[4:11]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s4, 0
; GFX6-NEXT: v_writelane_b32 v4, s5, 1
; GFX6-NEXT: v_writelane_b32 v4, s6, 2
; GFX6-NEXT: v_writelane_b32 v4, s7, 3
; GFX6-NEXT: v_writelane_b32 v4, s8, 4
; GFX6-NEXT: v_writelane_b32 v4, s9, 5
; GFX6-NEXT: v_writelane_b32 v4, s10, 6
; GFX6-NEXT: v_writelane_b32 v4, s11, 7
; GFX6-NEXT: s_mov_b32 s2, 0x80c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[4:11]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s4, 0
; GFX6-NEXT: v_writelane_b32 v4, s5, 1
; GFX6-NEXT: v_writelane_b32 v4, s6, 2
; GFX6-NEXT: v_writelane_b32 v4, s7, 3
; GFX6-NEXT: v_writelane_b32 v4, s8, 4
; GFX6-NEXT: v_writelane_b32 v4, s9, 5
; GFX6-NEXT: v_writelane_b32 v4, s10, 6
; GFX6-NEXT: v_writelane_b32 v4, s11, 7
; GFX6-NEXT: s_mov_b32 s2, 0x81400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[0:7]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s0, 0
; GFX6-NEXT: v_writelane_b32 v4, s1, 1
; GFX6-NEXT: v_writelane_b32 v4, s2, 2
; GFX6-NEXT: v_writelane_b32 v4, s3, 3
; GFX6-NEXT: v_writelane_b32 v4, s4, 4
; GFX6-NEXT: v_writelane_b32 v4, s5, 5
; GFX6-NEXT: v_writelane_b32 v4, s6, 6
; GFX6-NEXT: v_writelane_b32 v4, s7, 7
; GFX6-NEXT: s_mov_b32 s10, 0x81c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[8:15]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[16:23]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[24:31]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[0:3]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: ;;#ASMSTART
@ -9956,28 +9950,33 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ; def s33
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_mov_b64 vcc, s[6:7]
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1: ; %bb0
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s12, 0
; GFX6-NEXT: v_writelane_b32 v4, s13, 1
; GFX6-NEXT: v_writelane_b32 v4, s14, 2
; GFX6-NEXT: v_writelane_b32 v4, s15, 3
; GFX6-NEXT: s_mov_b32 s10, 0x82400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
; GFX6-NEXT: v_writelane_b32 v4, s8, 0
; GFX6-NEXT: v_writelane_b32 v4, s9, 1
; GFX6-NEXT: v_writelane_b32 v4, s10, 2
; GFX6-NEXT: v_writelane_b32 v4, s11, 3
; GFX6-NEXT: v_writelane_b32 v4, s12, 4
; GFX6-NEXT: v_writelane_b32 v4, s13, 5
; GFX6-NEXT: v_writelane_b32 v4, s14, 6
; GFX6-NEXT: v_writelane_b32 v4, s15, 7
; GFX6-NEXT: s_mov_b32 s34, 0x81400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
; GFX6-NEXT: s_mov_b64 s[20:21], exec
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s22, 0x80400
; GFX6-NEXT: s_mov_b32 s34, 0x80c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s22 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s8, v4, 0
; GFX6-NEXT: v_readlane_b32 s9, v4, 1
@ -9989,27 +9988,31 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s15, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[20:21]
; GFX6-NEXT: s_mov_b64 s[20:21], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s16, 0
; GFX6-NEXT: v_writelane_b32 v4, s17, 1
; GFX6-NEXT: v_writelane_b32 v4, s18, 2
; GFX6-NEXT: v_writelane_b32 v4, s19, 3
; GFX6-NEXT: s_mov_b32 s22, 0x82c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s22 ; 4-byte Folded Spill
; GFX6-NEXT: v_writelane_b32 v4, s20, 4
; GFX6-NEXT: v_writelane_b32 v4, s21, 5
; GFX6-NEXT: v_writelane_b32 v4, s22, 6
; GFX6-NEXT: v_writelane_b32 v4, s23, 7
; GFX6-NEXT: s_mov_b32 s34, 0x81c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[20:21]
; GFX6-NEXT: s_mov_b64 s[24:25], exec
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s26, 0x80c00
; GFX6-NEXT: s_mov_b32 s34, 0x81400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s26 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s16, v4, 0
; GFX6-NEXT: v_readlane_b32 s17, v4, 1
@ -10021,13 +10024,31 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s23, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[24:25]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s36, 0x81400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: v_writelane_b32 v4, s24, 0
; GFX6-NEXT: v_writelane_b32 v4, s25, 1
; GFX6-NEXT: v_writelane_b32 v4, s26, 2
; GFX6-NEXT: v_writelane_b32 v4, s27, 3
; GFX6-NEXT: v_writelane_b32 v4, s28, 4
; GFX6-NEXT: v_writelane_b32 v4, s29, 5
; GFX6-NEXT: v_writelane_b32 v4, s30, 6
; GFX6-NEXT: v_writelane_b32 v4, s31, 7
; GFX6-NEXT: s_mov_b32 s34, 0x82400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s34, 0x81c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s24, v4, 0
; GFX6-NEXT: v_readlane_b32 s25, v4, 1
@ -10039,8 +10060,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s31, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
@ -10048,12 +10069,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s1, 1
; GFX6-NEXT: v_writelane_b32 v4, s2, 2
; GFX6-NEXT: v_writelane_b32 v4, s3, 3
; GFX6-NEXT: s_mov_b32 s36, 0x82800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b32 s34, 0x82c00
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 3
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
@ -10066,11 +10087,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 vcc, s[6:7]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s36, 0x81c00
; GFX6-NEXT: s_mov_b32 s36, 0x82400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@ -10088,7 +10108,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s44, 0x82800
; GFX6-NEXT: s_mov_b32 s44, 0x82c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@ -10114,79 +10134,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35]
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_mov_b64 s[6:7], vcc
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s2, 0x82c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s16, v4, 0
; GFX6-NEXT: v_readlane_b32 s17, v4, 1
; GFX6-NEXT: v_readlane_b32 s18, v4, 2
; GFX6-NEXT: v_readlane_b32 s19, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s2, 0x82400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s12, v4, 0
; GFX6-NEXT: v_readlane_b32 s13, v4, 1
; GFX6-NEXT: v_readlane_b32 s14, v4, 2
; GFX6-NEXT: v_readlane_b32 s15, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b32 s0, 0x86600
; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b32 s0, 0x86a00
; GFX6-NEXT: s_waitcnt expcnt(4)
; GFX6-NEXT: v_mov_b32_e32 v0, v20
; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: v_mov_b32_e32 v1, v21
; GFX6-NEXT: v_mov_b32_e32 v2, v22
; GFX6-NEXT: v_mov_b32_e32 v3, v23
; GFX6-NEXT: s_waitcnt expcnt(3)
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt expcnt(2)
; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt expcnt(1)
; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x86600
; GFX6-NEXT: v_mov_b32_e32 v23, v3
; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: v_mov_b32_e32 v22, v2
; GFX6-NEXT: v_mov_b32_e32 v21, v1
; GFX6-NEXT: v_mov_b32_e32 v20, v0
; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: ;;#ASMSTART
@ -10200,16 +10149,69 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: .LBB1_2: ; %ret
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: s_or_b64 exec, exec, vcc
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s6, 0x80400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s0, v4, 0
; GFX6-NEXT: v_readlane_b32 s1, v4, 1
; GFX6-NEXT: v_readlane_b32 s2, v4, 2
; GFX6-NEXT: v_readlane_b32 s3, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1]
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s6, 0x80800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s0, v4, 0
; GFX6-NEXT: v_readlane_b32 s1, v4, 1
; GFX6-NEXT: v_readlane_b32 s2, v4, 2
; GFX6-NEXT: v_readlane_b32 s3, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
; GFX6-NEXT: s_mov_b32 s0, 0x86a00
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX6-NEXT: s_mov_b32 s0, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b64 s[14:15], s[18:19]
; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:240
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10217,7 +10219,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:224
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10225,7 +10227,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:208
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10233,7 +10235,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:192
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10241,7 +10243,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:176
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10249,7 +10251,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:160
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10257,15 +10259,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:128
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10273,7 +10267,15 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:112
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10281,7 +10283,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:96
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@ -10289,18 +10291,15 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:80
; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[12:15], 0 addr64 offset:64
; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[12:15], 0 addr64 offset:48
; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[12:15], 0 addr64 offset:32
; GFX6-NEXT: s_waitcnt expcnt(3)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[12:15], 0 addr64
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX9-FLATSCR-LABEL: test_limited_sgpr:

View File

@ -31,23 +31,22 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: testb $1, %bl
; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.1: ## %bb116.i
; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.2: ## %bb52.i.i
; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.3: ## %bb142.i
; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.4:
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi
; CHECK-NEXT: movb $1, %bh
; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp
; CHECK-NEXT: jmp LBB0_5
; CHECK-NEXT: LBB0_23: ## %bb7806
; CHECK-NEXT: LBB0_21: ## %bb7806
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp16: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@ -58,50 +57,50 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: LBB0_5: ## %bb3261
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmpl $37, 0
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.8: ## %bb3306
; CHECK-NEXT: jne LBB0_25
; CHECK-NEXT: ## %bb.6: ## %bb3306
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp0: ## EH_LABEL
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN12wxStringBaseaSEPKw
; CHECK-NEXT: Ltmp1: ## EH_LABEL
; CHECK-NEXT: ## %bb.9: ## %bb3314
; CHECK-NEXT: ## %bb.7: ## %bb3314
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: movl 0, %eax
; CHECK-NEXT: cmpl $121, %eax
; CHECK-NEXT: ja LBB0_6
; CHECK-NEXT: ## %bb.10: ## %bb3314
; CHECK-NEXT: ja LBB0_25
; CHECK-NEXT: ## %bb.8: ## %bb3314
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4)
; CHECK-NEXT: LBB0_12: ## %bb5809
; CHECK-NEXT: LBB0_10: ## %bb5809
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.13: ## %bb5809
; CHECK-NEXT: jne LBB0_25
; CHECK-NEXT: ## %bb.11: ## %bb5809
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb %bh, %bh
; CHECK-NEXT: je LBB0_6
; CHECK-NEXT: ## %bb.14: ## %bb91.i8504
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.12: ## %bb91.i8504
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb $1, %bl
; CHECK-NEXT: je LBB0_16
; CHECK-NEXT: ## %bb.15: ## %bb155.i8541
; CHECK-NEXT: je LBB0_14
; CHECK-NEXT: ## %bb.13: ## %bb155.i8541
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp4: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll _gmtime_r
; CHECK-NEXT: Ltmp5: ## EH_LABEL
; CHECK-NEXT: LBB0_16: ## %bb182.i8560
; CHECK-NEXT: LBB0_14: ## %bb182.i8560
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb $1, %bl
; CHECK-NEXT: je LBB0_17
; CHECK-NEXT: ## %bb.18: ## %bb278.i8617
; CHECK-NEXT: je LBB0_15
; CHECK-NEXT: ## %bb.16: ## %bb278.i8617
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: je LBB0_20
; CHECK-NEXT: ## %bb.19: ## %bb440.i8663
; CHECK-NEXT: je LBB0_18
; CHECK-NEXT: ## %bb.17: ## %bb440.i8663
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp6: ## EH_LABEL
; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax
@ -114,11 +113,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5
; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_
; CHECK-NEXT: Ltmp7: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_20
; CHECK-NEXT: LBB0_17: ## %bb187.i8591
; CHECK-NEXT: jmp LBB0_18
; CHECK-NEXT: LBB0_15: ## %bb187.i8591
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: LBB0_20: ## %invcont5814
; CHECK-NEXT: jne LBB0_25
; CHECK-NEXT: LBB0_18: ## %invcont5814
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp8: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@ -127,7 +126,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: Ltmp9: ## EH_LABEL
; CHECK-NEXT: ## %bb.21: ## %invcont5831
; CHECK-NEXT: ## %bb.19: ## %invcont5831
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp10: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@ -137,7 +136,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN12wxStringBase10ConcatSelfEmPKwm
; CHECK-NEXT: Ltmp11: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_5
; CHECK-NEXT: LBB0_11: ## %bb5657
; CHECK-NEXT: LBB0_9: ## %bb5657
; CHECK-NEXT: Ltmp13: ## EH_LABEL
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
@ -145,8 +144,8 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: movl %eax, (%esp)
; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
; CHECK-NEXT: Ltmp14: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: LBB0_22: ## %bb5968
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_20: ## %bb5968
; CHECK-NEXT: Ltmp2: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@ -154,24 +153,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: Ltmp3: ## EH_LABEL
; CHECK-NEXT: LBB0_6: ## %bb3267
; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: LBB0_7: ## %bb115.critedge.i
; CHECK-NEXT: addl $28, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl $4
; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp
; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp
; CHECK-NEXT: Ltmp15: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: LBB0_26: ## %lpad8185
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_24: ## %lpad8185
; CHECK-NEXT: Ltmp12: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: LBB0_24: ## %lpad.loopexit
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_22: ## %lpad.loopexit
; CHECK-NEXT: Ltmp18: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: Lfunc_end0:
entry:
br i1 %foo, label %bb116.i, label %bb115.critedge.i