[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. (#157843)
Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave, which write to SCC asynchronously; the event is tracked by the KM_CNT counter. Also start tracking SCC for reads and writes. An s_barrier_wait on the same barrier guarantees that the SCC write from s_barrier_signal_isfirst has landed, so no s_wait_kmcnt needs to be inserted in that case.
This commit is contained in:
parent
8dae17be29
commit
2ec7959b96
@ -121,6 +121,7 @@ struct HardwareLimits {
|
||||
DECL(LDS_ACCESS) /* lds read & write */ \
|
||||
DECL(GDS_ACCESS) /* gds read & write */ \
|
||||
DECL(SQ_MESSAGE) /* send message */ \
|
||||
DECL(SCC_WRITE) /* write to SCC from barrier */ \
|
||||
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
|
||||
DECL(SMEM_GROUP) /* scalar-memory group */ \
|
||||
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
|
||||
@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
|
||||
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
|
||||
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
|
||||
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
|
||||
// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
|
||||
// We reserve a fixed number of VGPR slots in the scoring tables for
|
||||
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
|
||||
enum RegisterMapping {
|
||||
@ -163,6 +165,9 @@ enum RegisterMapping {
|
||||
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
|
||||
NUM_LDS_VGPRS = 9, // One more than the stores we track.
|
||||
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
|
||||
NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
|
||||
// Remaining non-allocatable registers
|
||||
SCC = NUM_ALL_ALLOCATABLE
|
||||
};
|
||||
|
||||
// Enumerate different types of result-returning VMEM operations. Although
|
||||
@ -401,7 +406,7 @@ public:
|
||||
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
|
||||
eventMask({VMEM_SAMPLER_READ_ACCESS}),
|
||||
eventMask({VMEM_BVH_READ_ACCESS}),
|
||||
eventMask({SMEM_ACCESS, SQ_MESSAGE}),
|
||||
eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
|
||||
eventMask({VMEM_GROUP, SMEM_GROUP})};
|
||||
|
||||
return WaitEventMaskForInstGFX12Plus;
|
||||
@ -586,6 +591,7 @@ public:
|
||||
WaitcntBrackets &ScoreBrackets);
|
||||
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
|
||||
WaitcntBrackets &ScoreBrackets);
|
||||
static bool asynchronouslyWritesSCC(unsigned Opcode);
|
||||
};
|
||||
|
||||
// This objects maintains the current score brackets of each wait counter, and
|
||||
@ -626,7 +632,12 @@ public:
|
||||
unsigned getRegScore(int GprNo, InstCounterType T) const {
|
||||
if (GprNo < NUM_ALL_VGPRS)
|
||||
return VgprScores[T][GprNo];
|
||||
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
|
||||
|
||||
if (GprNo < NUM_ALL_ALLOCATABLE)
|
||||
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
|
||||
|
||||
assert(GprNo == SCC);
|
||||
return SCCScore;
|
||||
}
|
||||
|
||||
bool merge(const WaitcntBrackets &Other);
|
||||
@ -646,6 +657,7 @@ public:
|
||||
AMDGPU::Waitcnt &Wait) const {
|
||||
determineWait(T, {RegNo, RegNo + 1}, Wait);
|
||||
}
|
||||
void tryClearSCCWriteEvent(MachineInstr *Inst);
|
||||
|
||||
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
|
||||
void applyWaitcnt(InstCounterType T, unsigned Count);
|
||||
@ -785,6 +797,10 @@ private:
|
||||
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
|
||||
// X_CNT score.
|
||||
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
|
||||
// Reg score for SCC.
|
||||
unsigned SCCScore = 0;
|
||||
// The unique instruction that has an SCC write pending, if there is one.
|
||||
const MachineInstr *PendingSCCWrite = nullptr;
|
||||
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
|
||||
// write to each vgpr.
|
||||
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
|
||||
@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
|
||||
const MachineRegisterInfo *MRI,
|
||||
const SIRegisterInfo *TRI,
|
||||
const MachineOperand &Op) const {
|
||||
if (Op.getReg() == AMDGPU::SCC)
|
||||
return {SCC, SCC + 1};
|
||||
|
||||
if (!TRI->isInAllocatableClass(Op.getReg()))
|
||||
return {-1, -1};
|
||||
|
||||
@ -873,9 +892,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
|
||||
if (RegNo < NUM_ALL_VGPRS) {
|
||||
VgprUB = std::max(VgprUB, RegNo);
|
||||
VgprScores[CntTy][RegNo] = Score;
|
||||
} else {
|
||||
} else if (RegNo < NUM_ALL_ALLOCATABLE) {
|
||||
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
|
||||
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
|
||||
} else {
|
||||
assert(RegNo == SCC);
|
||||
SCCScore = Score;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1086,6 +1108,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
|
||||
if (Slot)
|
||||
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
|
||||
}
|
||||
|
||||
if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
|
||||
setRegScore(SCC, T, CurrScore);
|
||||
PendingSCCWrite = &Inst;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1154,6 +1181,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
|
||||
OS << RelScore << ":s" << J << " ";
|
||||
}
|
||||
}
|
||||
if (T == KM_CNT && SCCScore > 0)
|
||||
OS << SCCScore << ":scc ";
|
||||
}
|
||||
OS << '\n';
|
||||
}
|
||||
@ -1228,6 +1257,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
|
||||
}
|
||||
}
|
||||
|
||||
// Retire the tracked SCC write when \p Inst makes waiting on it unnecessary.
//
// \p Inst is expected to carry the barrier id as an immediate in operand 0
// (the caller in this file passes an S_BARRIER_WAIT). The event is retired
// only when the tracked PendingSCCWrite is an S_BARRIER_SIGNAL_ISFIRST_IMM
// whose operand 0 immediate equals that of \p Inst; the M0 form and
// S_BARRIER_LEAVE are not matched here. If nothing is tracked, or the
// barrier ids differ, the brackets are left unchanged.
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
|
||||
// S_BARRIER_WAIT on the same barrier guarantees that the pending write to
|
||||
// SCC has landed
|
||||
if (PendingSCCWrite &&
|
||||
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
|
||||
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
|
||||
unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
|
||||
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
|
||||
if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
|
||||
SCC_WRITE_PendingEvent) {
|
||||
setScoreLB(KM_CNT, getScoreUB(KM_CNT));
|
||||
}
|
||||
|
||||
// Whether or not the counter was reset, the SCC write itself is no longer
// outstanding: drop the event bit and forget the instruction.
PendingEvents &= ~SCC_WRITE_PendingEvent;
|
||||
PendingSCCWrite = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
|
||||
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
|
||||
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
|
||||
@ -1917,6 +1964,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
|
||||
Wait);
|
||||
}
|
||||
}
|
||||
} else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
|
||||
ScoreBrackets.tryClearSCCWriteEvent(&MI);
|
||||
} else {
|
||||
// FIXME: Should not be relying on memoperands.
|
||||
// Look at the source operands of every instruction to see if
|
||||
@ -2006,6 +2055,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
|
||||
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
|
||||
}
|
||||
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
|
||||
} else if (Op.getReg() == AMDGPU::SCC) {
|
||||
ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
|
||||
} else {
|
||||
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
|
||||
}
|
||||
@ -2343,6 +2394,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
|
||||
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
|
||||
else
|
||||
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
|
||||
} else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
|
||||
ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
|
||||
} else {
|
||||
switch (Inst.getOpcode()) {
|
||||
case AMDGPU::S_SENDMSG:
|
||||
@ -2353,9 +2406,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
|
||||
break;
|
||||
case AMDGPU::S_MEMTIME:
|
||||
case AMDGPU::S_MEMREALTIME:
|
||||
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
|
||||
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
|
||||
case AMDGPU::S_BARRIER_LEAVE:
|
||||
case AMDGPU::S_GET_BARRIER_STATE_M0:
|
||||
case AMDGPU::S_GET_BARRIER_STATE_IMM:
|
||||
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
|
||||
@ -2422,6 +2472,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
|
||||
if (T == DS_CNT)
|
||||
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
|
||||
|
||||
if (T == KM_CNT) {
|
||||
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
|
||||
if (Other.hasPendingEvent(SCC_WRITE)) {
|
||||
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
|
||||
if (!OldEventsHasSCCWrite) {
|
||||
PendingSCCWrite = Other.PendingSCCWrite;
|
||||
} else {
|
||||
if (PendingSCCWrite != Other.PendingSCCWrite)
|
||||
PendingSCCWrite = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int J = 0; J <= VgprUB; J++)
|
||||
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
|
||||
|
||||
@ -2453,6 +2516,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
|
||||
counterTypeForInstr(Opcode).has_value();
|
||||
}
|
||||
|
||||
// Return true if \p Opcode is one of the barrier instructions this pass
// records as an SCC_WRITE event: S_BARRIER_LEAVE, or either form
// (immediate or M0) of S_BARRIER_SIGNAL_ISFIRST.
bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
|
||||
return Opcode == AMDGPU::S_BARRIER_LEAVE ||
|
||||
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
|
||||
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
|
||||
}
|
||||
|
||||
// Generate s_waitcnt instructions where needed.
|
||||
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
||||
MachineBasicBlock &Block,
|
||||
|
||||
@ -12,10 +12,10 @@ define i1 @func1() {
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-GISEL-LABEL: func1:
|
||||
@ -27,13 +27,86 @@ define i1 @func1() {
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
|
||||
ret i1 %r
|
||||
}
|
||||
|
||||
define i1 @signal_isfirst_same_barrier_wait() {
|
||||
; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
|
||||
; GFX12-SDAG: ; %bb.0:
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-SDAG-NEXT: s_barrier_wait -1
|
||||
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
|
||||
; GFX12-GISEL: ; %bb.0:
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-GISEL-NEXT: s_barrier_wait -1
|
||||
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
|
||||
call void @llvm.amdgcn.s.barrier.wait(i16 -1)
|
||||
ret i1 %r
|
||||
}
|
||||
|
||||
define i1 @signal_isfirst_different_barrier_wait() {
|
||||
; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
|
||||
; GFX12-SDAG: ; %bb.0:
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-SDAG-NEXT: s_barrier_wait 0
|
||||
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
|
||||
; GFX12-GISEL: ; %bb.0:
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
|
||||
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-GISEL-NEXT: s_barrier_wait 0
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
|
||||
call void @llvm.amdgcn.s.barrier.wait(i16 0)
|
||||
ret i1 %r
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.s.barrier.wait(i16)
|
||||
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
|
||||
|
||||
@ -0,0 +1,105 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
|
||||
|
||||
---
|
||||
name: signal_isfirst_imm_same_barrier_wait
|
||||
body: |
|
||||
bb.0:
|
||||
; GCN-LABEL: name: signal_isfirst_imm_same_barrier_wait
|
||||
; GCN: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GCN-NEXT: S_WAIT_EXPCNT 0
|
||||
; GCN-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GCN-NEXT: S_WAIT_BVHCNT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
; GCN-NEXT: S_BARRIER_WAIT -1
|
||||
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
S_BARRIER_WAIT -1
|
||||
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
...
|
||||
|
||||
---
|
||||
name: signal_isfirst_imm_different_barrier_wait
|
||||
body: |
|
||||
bb.0:
|
||||
; GCN-LABEL: name: signal_isfirst_imm_different_barrier_wait
|
||||
; GCN: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GCN-NEXT: S_WAIT_EXPCNT 0
|
||||
; GCN-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GCN-NEXT: S_WAIT_BVHCNT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
; GCN-NEXT: S_BARRIER_WAIT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
S_BARRIER_WAIT 0
|
||||
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
...
|
||||
|
||||
---
|
||||
name: signal_isfirst_m0_same_barrier_wait
|
||||
body: |
|
||||
bb.0:
|
||||
; GCN-LABEL: name: signal_isfirst_m0_same_barrier_wait
|
||||
; GCN: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GCN-NEXT: S_WAIT_EXPCNT 0
|
||||
; GCN-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GCN-NEXT: S_WAIT_BVHCNT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: $m0 = S_MOV_B32 -1
|
||||
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
|
||||
; GCN-NEXT: S_BARRIER_WAIT -1
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
$m0 = S_MOV_B32 -1
|
||||
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
|
||||
S_BARRIER_WAIT -1
|
||||
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
...
|
||||
|
||||
---
|
||||
name: signal_isfirst_m0_different_barrier_wait
|
||||
body: |
|
||||
bb.0:
|
||||
; GCN-LABEL: name: signal_isfirst_m0_different_barrier_wait
|
||||
; GCN: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GCN-NEXT: S_WAIT_EXPCNT 0
|
||||
; GCN-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GCN-NEXT: S_WAIT_BVHCNT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: $m0 = S_MOV_B32 -1
|
||||
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
|
||||
; GCN-NEXT: S_BARRIER_WAIT 0
|
||||
; GCN-NEXT: S_WAIT_KMCNT 0
|
||||
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
S_CMP_EQ_U32 0, 0, implicit-def $scc
|
||||
$m0 = S_MOV_B32 -1
|
||||
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
|
||||
S_BARRIER_WAIT 0
|
||||
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
|
||||
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
|
||||
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
|
||||
...
|
||||
@ -155,6 +155,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
|
||||
; GFX12-GISEL-NEXT: s_barrier_signal -1
|
||||
; GFX12-GISEL-NEXT: s_barrier_join m0
|
||||
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
|
||||
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
|
||||
; GFX12-GISEL-NEXT: s_barrier_wait 1
|
||||
; GFX12-GISEL-NEXT: s_barrier_leave
|
||||
|
||||
173
llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
Normal file
173
llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
Normal file
@ -0,0 +1,173 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
|
||||
|
||||
---
|
||||
name: scc_write_in_other_block
|
||||
body: |
|
||||
; GFX12-LABEL: name: scc_write_in_other_block
|
||||
; GFX12: bb.0:
|
||||
; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GFX12-NEXT: S_WAIT_EXPCNT 0
|
||||
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GFX12-NEXT: S_WAIT_BVHCNT 0
|
||||
; GFX12-NEXT: S_WAIT_KMCNT 0
|
||||
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.1:
|
||||
; GFX12-NEXT: successors: %bb.2(0x80000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
|
||||
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.2:
|
||||
; GFX12-NEXT: S_WAIT_KMCNT 0
|
||||
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
|
||||
bb.1:
|
||||
renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
$vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
|
||||
|
||||
bb.2:
|
||||
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: scc_write_in_other_block_with_barrier_wait
|
||||
body: |
|
||||
; GFX12-LABEL: name: scc_write_in_other_block_with_barrier_wait
|
||||
; GFX12: bb.0:
|
||||
; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GFX12-NEXT: S_WAIT_EXPCNT 0
|
||||
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GFX12-NEXT: S_WAIT_BVHCNT 0
|
||||
; GFX12-NEXT: S_WAIT_KMCNT 0
|
||||
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.1:
|
||||
; GFX12-NEXT: successors: %bb.2(0x80000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
|
||||
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.2:
|
||||
; GFX12-NEXT: S_BARRIER_WAIT -1
|
||||
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
|
||||
bb.1:
|
||||
renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
$vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
|
||||
|
||||
bb.2:
|
||||
S_BARRIER_WAIT -1
|
||||
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: scc_write_in_multiple_blocks_with_barrier_wait
|
||||
body: |
|
||||
; GFX12-LABEL: name: scc_write_in_multiple_blocks_with_barrier_wait
|
||||
; GFX12: bb.0:
|
||||
; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
|
||||
; GFX12-NEXT: S_WAIT_EXPCNT 0
|
||||
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
|
||||
; GFX12-NEXT: S_WAIT_BVHCNT 0
|
||||
; GFX12-NEXT: S_WAIT_KMCNT 0
|
||||
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.1:
|
||||
; GFX12-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
|
||||
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.2:
|
||||
; GFX12-NEXT: successors: %bb.5(0x80000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
|
||||
; GFX12-NEXT: S_BRANCH %bb.5
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.3:
|
||||
; GFX12-NEXT: successors: %bb.5(0x80000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
|
||||
; GFX12-NEXT: S_BRANCH %bb.5
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.4:
|
||||
; GFX12-NEXT: successors: %bb.5(0x80000000)
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: bb.5:
|
||||
; GFX12-NEXT: S_BARRIER_WAIT -1
|
||||
; GFX12-NEXT: S_WAIT_KMCNT 0
|
||||
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
; GFX12-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
|
||||
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
|
||||
S_CBRANCH_EXECZ %bb.4, implicit $exec
|
||||
|
||||
bb.1:
|
||||
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
|
||||
S_CBRANCH_EXECZ %bb.3, implicit $exec
|
||||
|
||||
bb.2:
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
|
||||
S_BRANCH %bb.5
|
||||
|
||||
bb.3:
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
|
||||
S_BRANCH %bb.5
|
||||
|
||||
bb.4:
|
||||
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
|
||||
|
||||
bb.5:
|
||||
S_BARRIER_WAIT -1
|
||||
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
|
||||
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user