[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. (#157843)

Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave,
the instructions that write to SCC asynchronously; it is tracked by KM_CNT.
Also start tracking SCC for reads and writes.
An s_barrier_wait on the same barrier guarantees that the SCC write from
s_barrier_signal_isfirst has landed, so no s_wait_kmcnt needs to be inserted.
This commit is contained in:
Petar Avramovic 2025-09-18 14:41:01 +02:00 committed by GitHub
parent 8dae17be29
commit 2ec7959b96
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 429 additions and 8 deletions

View File

@ -121,6 +121,7 @@ struct HardwareLimits {
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
DECL(SCC_WRITE) /* write to SCC from barrier */ \
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
@ -163,6 +165,9 @@ enum RegisterMapping {
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
NUM_LDS_VGPRS = 9, // One more than the stores we track.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
// Remaining non-allocatable registers
SCC = NUM_ALL_ALLOCATABLE
};
// Enumerate different types of result-returning VMEM operations. Although
@ -401,7 +406,7 @@ public:
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
eventMask({SMEM_ACCESS, SQ_MESSAGE}),
eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
eventMask({VMEM_GROUP, SMEM_GROUP})};
return WaitEventMaskForInstGFX12Plus;
@ -586,6 +591,7 @@ public:
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
static bool asynchronouslyWritesSCC(unsigned Opcode);
};
// This object maintains the current score brackets of each wait counter, and
@ -626,7 +632,12 @@ public:
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS)
return VgprScores[T][GprNo];
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
if (GprNo < NUM_ALL_ALLOCATABLE)
return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
assert(GprNo == SCC);
return SCCScore;
}
bool merge(const WaitcntBrackets &Other);
@ -646,6 +657,7 @@ public:
AMDGPU::Waitcnt &Wait) const {
determineWait(T, {RegNo, RegNo + 1}, Wait);
}
void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@ -785,6 +797,10 @@ private:
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
// X_CNT score.
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
// Reg score for SCC.
unsigned SCCScore = 0;
// The unique instruction that has an SCC write pending, if there is one.
const MachineInstr *PendingSCCWrite = nullptr;
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
const MachineOperand &Op) const {
if (Op.getReg() == AMDGPU::SCC)
return {SCC, SCC + 1};
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
@ -873,9 +892,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
if (RegNo < NUM_ALL_VGPRS) {
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
} else {
} else if (RegNo < NUM_ALL_ALLOCATABLE) {
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
} else {
assert(RegNo == SCC);
SCCScore = Score;
}
}
}
@ -1086,6 +1108,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (Slot)
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
setRegScore(SCC, T, CurrScore);
PendingSCCWrite = &Inst;
}
}
}
@ -1154,6 +1181,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":s" << J << " ";
}
}
if (T == KM_CNT && SCCScore > 0)
OS << SCCScore << ":scc ";
}
OS << '\n';
}
@ -1228,6 +1257,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
}
}
// Called for an S_BARRIER_WAIT instruction \p Inst. If the uniquely tracked
// pending SCC write is an S_BARRIER_SIGNAL_ISFIRST_IMM whose barrier immediate
// equals the one \p Inst waits on, the SCC_WRITE event is retired: the barrier
// wait guarantees that the pending write to SCC has landed.
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // Nothing to retire unless a single pending SCC writer is being tracked.
  if (!PendingSCCWrite)
    return;

  // Only the immediate form of the signal can be matched against the wait.
  if (PendingSCCWrite->getOpcode() != AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)
    return;

  // Both instructions must name the same barrier in operand 0.
  const auto SignalledBarrier = PendingSCCWrite->getOperand(0).getImm();
  const auto WaitedBarrier = Inst->getOperand(0).getImm();
  if (SignalledBarrier != WaitedBarrier)
    return;

  const unsigned SCCWriteBit = 1 << SCC_WRITE;

  // When SCC_WRITE is the sole outstanding KM_CNT event, the whole counter can
  // be treated as drained by moving its lower bound up to the upper bound.
  const auto PendingKmEvents =
      PendingEvents & Context->WaitEventMaskForInst[KM_CNT];
  if (PendingKmEvents == SCCWriteBit)
    setScoreLB(KM_CNT, getScoreUB(KM_CNT));

  // In every matched case, forget the event and its originating instruction.
  PendingEvents &= ~SCCWriteBit;
  PendingSCCWrite = nullptr;
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
@ -1917,6 +1964,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait);
}
}
} else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
@ -2006,6 +2055,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
} else if (Op.getReg() == AMDGPU::SCC) {
ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
@ -2343,6 +2394,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
} else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
@ -2353,9 +2406,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
@ -2422,6 +2472,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
if (T == KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(SCC_WRITE)) {
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
if (!OldEventsHasSCCWrite) {
PendingSCCWrite = Other.PendingSCCWrite;
} else {
if (PendingSCCWrite != Other.PendingSCCWrite)
PendingSCCWrite = nullptr;
}
}
}
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
@ -2453,6 +2516,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
// Returns true when \p Opcode is one of the barrier instructions this pass
// treats as producing an asynchronous write to SCC: S_BARRIER_LEAVE and both
// the immediate and M0 forms of S_BARRIER_SIGNAL_ISFIRST.
bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_BARRIER_LEAVE:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    return true;
  default:
    return false;
  }
}
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,

View File

@ -12,10 +12,10 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
@ -27,13 +27,86 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
ret i1 %r
}
define i1 @signal_isfirst_same_barrier_wait() {
; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_wait -1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_barrier_wait -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 -1)
ret i1 %r
}
define i1 @signal_isfirst_different_barrier_wait() {
; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_wait 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_barrier_wait 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 0)
ret i1 %r
}
declare void @llvm.amdgcn.s.barrier.wait(i16)
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)

View File

@ -0,0 +1,105 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
---
name: signal_isfirst_imm_same_barrier_wait
body: |
bb.0:
; GCN-LABEL: name: signal_isfirst_imm_same_barrier_wait
; GCN: S_WAIT_LOADCNT_DSCNT 0
; GCN-NEXT: S_WAIT_EXPCNT 0
; GCN-NEXT: S_WAIT_SAMPLECNT 0
; GCN-NEXT: S_WAIT_BVHCNT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
; GCN-NEXT: S_BARRIER_WAIT -1
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_CMP_EQ_U32 0, 0, implicit-def $scc
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
S_BARRIER_WAIT -1
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
---
name: signal_isfirst_imm_different_barrier_wait
body: |
bb.0:
; GCN-LABEL: name: signal_isfirst_imm_different_barrier_wait
; GCN: S_WAIT_LOADCNT_DSCNT 0
; GCN-NEXT: S_WAIT_EXPCNT 0
; GCN-NEXT: S_WAIT_SAMPLECNT 0
; GCN-NEXT: S_WAIT_BVHCNT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
; GCN-NEXT: S_BARRIER_WAIT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_CMP_EQ_U32 0, 0, implicit-def $scc
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
S_BARRIER_WAIT 0
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
---
name: signal_isfirst_m0_same_barrier_wait
body: |
bb.0:
; GCN-LABEL: name: signal_isfirst_m0_same_barrier_wait
; GCN: S_WAIT_LOADCNT_DSCNT 0
; GCN-NEXT: S_WAIT_EXPCNT 0
; GCN-NEXT: S_WAIT_SAMPLECNT 0
; GCN-NEXT: S_WAIT_BVHCNT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GCN-NEXT: $m0 = S_MOV_B32 -1
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
; GCN-NEXT: S_BARRIER_WAIT -1
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_CMP_EQ_U32 0, 0, implicit-def $scc
$m0 = S_MOV_B32 -1
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
S_BARRIER_WAIT -1
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
---
name: signal_isfirst_m0_different_barrier_wait
body: |
bb.0:
; GCN-LABEL: name: signal_isfirst_m0_different_barrier_wait
; GCN: S_WAIT_LOADCNT_DSCNT 0
; GCN-NEXT: S_WAIT_EXPCNT 0
; GCN-NEXT: S_WAIT_SAMPLECNT 0
; GCN-NEXT: S_WAIT_BVHCNT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; GCN-NEXT: $m0 = S_MOV_B32 -1
; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
; GCN-NEXT: S_BARRIER_WAIT 0
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_CMP_EQ_U32 0, 0, implicit-def $scc
$m0 = S_MOV_B32 -1
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
S_BARRIER_WAIT 0
renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...

View File

@ -155,6 +155,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_barrier_signal -1
; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_barrier_leave

View File

@ -0,0 +1,173 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
---
name: scc_write_in_other_block
body: |
; GFX12-LABEL: name: scc_write_in_other_block
; GFX12: bb.0:
; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
; GFX12-NEXT: S_WAIT_EXPCNT 0
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
; GFX12-NEXT: S_WAIT_BVHCNT 0
; GFX12-NEXT: S_WAIT_KMCNT 0
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.1:
; GFX12-NEXT: successors: %bb.2(0x80000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
; GFX12-NEXT: S_WAIT_KMCNT 0
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
bb.0:
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
$vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
bb.2:
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: scc_write_in_other_block_with_barrier_wait
body: |
; GFX12-LABEL: name: scc_write_in_other_block_with_barrier_wait
; GFX12: bb.0:
; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
; GFX12-NEXT: S_WAIT_EXPCNT 0
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
; GFX12-NEXT: S_WAIT_BVHCNT 0
; GFX12-NEXT: S_WAIT_KMCNT 0
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.1:
; GFX12-NEXT: successors: %bb.2(0x80000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
; GFX12-NEXT: S_BARRIER_WAIT -1
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
bb.0:
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
$vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
bb.2:
S_BARRIER_WAIT -1
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: scc_write_in_multiple_blocks_with_barrier_wait
body: |
; GFX12-LABEL: name: scc_write_in_multiple_blocks_with_barrier_wait
; GFX12: bb.0:
; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
; GFX12-NEXT: S_WAIT_EXPCNT 0
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
; GFX12-NEXT: S_WAIT_BVHCNT 0
; GFX12-NEXT: S_WAIT_KMCNT 0
; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.1:
; GFX12-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
; GFX12-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.2:
; GFX12-NEXT: successors: %bb.5(0x80000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
; GFX12-NEXT: S_BRANCH %bb.5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.3:
; GFX12-NEXT: successors: %bb.5(0x80000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
; GFX12-NEXT: S_BRANCH %bb.5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.4:
; GFX12-NEXT: successors: %bb.5(0x80000000)
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: bb.5:
; GFX12-NEXT: S_BARRIER_WAIT -1
; GFX12-NEXT: S_WAIT_KMCNT 0
; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
bb.0:
S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
S_CBRANCH_EXECZ %bb.4, implicit $exec
bb.1:
V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
S_CBRANCH_EXECZ %bb.3, implicit $exec
bb.2:
S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
S_BRANCH %bb.5
bb.3:
S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
S_BRANCH %bb.5
bb.4:
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
bb.5:
S_BARRIER_WAIT -1
renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
$vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
S_ENDPGM 0
...