From 2dcd75eb4420c8a5af13b80a065c03689100c96f Mon Sep 17 00:00:00 2001 From: Vigneshwar Jayakumar Date: Thu, 5 Feb 2026 10:13:51 -0600 Subject: [PATCH] [AMDGPU] Fix missing waitcnt after buffer_wbl2 (#178316) On GFX9, BUFFER_WBL2 is used to write back dirty cache lines and requires an s_waitcnt vmcnt(0) afterwards to ensure completion. This patch fixes that by incrementing vmcnt for the buffer_wbl2 instruction. --------- Co-authored-by: Jay Foad --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 12 ++ .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 12 ++ .../AMDGPU/GlobalISel/fp-atomics-gfx942.ll | 7 + .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 30 ++++- .../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 24 ++++ .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 102 ++++++++++++++ .../AMDGPU/a-v-global-atomic-cmpxchg.ll | 24 ++++ .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 102 ++++++++++++++ llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2 + .../buffer-fat-pointer-atomicrmw-fadd.ll | 37 +++++- .../buffer-fat-pointer-atomicrmw-fmax.ll | 25 +++- .../buffer-fat-pointer-atomicrmw-fmin.ll | 25 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 94 +++++++++++++ .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 74 +++++++++++ .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 74 +++++++++++ .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 70 ++++++++++ .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 124 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 3 + .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 30 ++++- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 104 +++++++++++++++ .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 74 +++++++++++ .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 74 +++++++++++ .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 70 ++++++++++ .../test/CodeGen/AMDGPU/idemponent-atomics.ll | 21 +-- .../insert_waitcnt_for_precise_memory.ll | 2 + .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 40 +++--- 
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 16 ++- llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll | 57 ++++++++ 29 files changed, 1281 insertions(+), 54 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9a2a7b492388..6f9328f49846 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2783,7 +2783,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && - !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { + (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) || + Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) { + // BUFFER_WBL2 is included here because, unlike invalidates, it has to be + // followed by "S_WAITCNT vmcnt(0)" to ensure the writeback has + // completed. 
IsVMEMAccess = true; ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 32f539a267e6..9adb56cb0861 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -333,6 +333,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -478,6 +479,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -630,6 +632,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -785,6 +788,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -924,6 +928,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 ; 
GFX942-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1065,6 +1070,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1216,6 +1222,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1369,6 +1376,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1513,6 +1521,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1667,6 +1676,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1831,6 +1841,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1997,6 +2008,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index be0ef85b217d..876eacb76369 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -333,6 +333,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -478,6 +479,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -630,6 +632,7 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -785,6 +788,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -924,6 +928,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1065,6 +1070,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1216,6 +1222,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1369,6 +1376,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1513,6 +1521,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1667,6 +1676,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1831,6 +1841,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1997,6 +2008,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll index 624b20afeb37..99c3765b0fd1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll @@ -9,6 +9,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -25,6 +26,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -39,6 +41,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -76,6 +79,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -90,6 +94,7 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -104,6 +109,7 @@ define <2 x half> 
@flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -118,6 +124,7 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 983059bf184f..2bdbdb93d914 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1490,7 +1490,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1587,7 +1587,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX942-NEXT: v_mul_f64 
v[0:1], v[0:1], 4.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1640,7 +1640,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1663,7 +1663,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1737,7 +1737,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1781,6 +1781,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1792,6 +1793,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr 
addrspace(1) %ptr, double %dat ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1831,6 +1833,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1861,6 +1864,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1872,6 +1876,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1932,7 +1937,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1978,6 +1983,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1991,6 +1997,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2035,6 +2042,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2068,6 +2076,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2081,6 +2090,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2112,6 +2122,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2123,6 +2134,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2162,6 +2174,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2192,6 +2205,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2204,6 +2218,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2248,6 +2263,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 
v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index e882769f97ac..4f40948cab0a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -16,6 +16,7 @@ define void @flat_atomic_cmpxchg_i32_ret_av_av__av(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -44,6 +45,7 @@ define void @flat_atomic_cmpxchg_i32_ret_av_av__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -72,6 +74,7 @@ define void @flat_atomic_cmpxchg_i32_ret_av_av__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -103,6 +106,7 @@ define void @flat_atomic_cmpxchg_i32_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -134,6 +138,7 @@ define void @flat_atomic_cmpxchg_i32_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 
glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -165,6 +170,7 @@ define void @flat_atomic_cmpxchg_i32_ret_v_a__v(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -194,6 +200,7 @@ define void @flat_atomic_cmpxchg_i32_ret_a_v__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -222,6 +229,7 @@ define void @flat_atomic_cmpxchg_i32_ret_v_v__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -251,6 +259,7 @@ define void @flat_atomic_cmpxchg_i32_ret_av_v__av(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -279,6 +288,7 @@ define void @flat_atomic_cmpxchg_i32_ret_v_av__av(ptr %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -309,6 +319,7 @@ define void @flat_atomic_cmpxchg_i32_ret_av_a__av(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -339,6 +350,7 @@ define void @flat_atomic_cmpxchg_i32_ret_a_av__av(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -380,6 +392,7 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -436,6 +449,7 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB13_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -492,6 +506,7 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB14_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -561,6 +576,7 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -631,6 +647,7 @@ define void 
@flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -692,6 +709,7 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -753,6 +771,7 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -810,6 +829,7 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB19_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -871,6 +891,7 @@ define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB20_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -927,6 +948,7 @@ define void @flat_atomic_cmpxchg_i64_ret_v_av__av(ptr %ptr) #0 { ; CHECK-NEXT: s_cbranch_execz .LBB21_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; 
CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -987,6 +1009,7 @@ define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -1048,6 +1071,7 @@ define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index b5f952b0bb00..0a300674df50 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -16,6 +16,7 @@ define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -35,6 +36,7 @@ define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -60,6 +62,7 @@ define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -78,6 +81,7 @@ define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -101,6 +105,7 @@ define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -118,6 +123,7 @@ define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -142,6 +148,7 @@ define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -158,6 +165,7 @@ define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -181,6 +189,7 @@ define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -197,6 +206,7 @@ define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -220,6 +230,7 @@ define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -237,6 +248,7 @@ define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -262,6 +274,7 @@ define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -280,6 +293,7 @@ define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -303,6 +317,7 @@ define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: 
buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -319,6 +334,7 @@ define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -577,6 +593,7 @@ define void @flat_atomic_xchg_i32_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v[0:1], a0 offset:40 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -590,6 +607,7 @@ define void @flat_atomic_xchg_i32_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v[0:1], a0 offset:40 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -608,6 +626,7 @@ define void @flat_atomic_xchg_i32_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 offset:40 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -621,6 +640,7 @@ define void @flat_atomic_xchg_i32_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap v[0:1], v2 offset:40 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -655,6 +675,7 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB11_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: 
buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -701,6 +722,7 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -753,6 +775,7 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB12_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -797,6 +820,7 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -845,6 +869,7 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB13_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -889,6 +914,7 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-NEXT: buffer_inv sc0 sc1 @@ -939,6 +965,7 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -981,6 +1008,7 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB14_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1029,6 +1057,7 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1071,6 +1100,7 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB15_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1119,6 +1149,7 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB16_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1163,6 +1194,7 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB16_2 ; 
GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1215,6 +1247,7 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB17_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1259,6 +1292,7 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB17_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1307,6 +1341,7 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1349,6 +1384,7 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB18_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1399,6 +1435,7 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 
v[0:1], a[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1435,6 +1472,7 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_setpc_b64 s[30:31] ; GFX950-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1] sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1476,6 +1514,7 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1511,6 +1550,7 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_setpc_b64 s[30:31] ; GFX950-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1551,6 +1591,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1582,6 +1623,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1620,6 +1662,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { 
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1650,6 +1693,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1686,6 +1730,7 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1716,6 +1761,7 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1753,6 +1799,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1782,6 +1829,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1818,6 +1866,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1847,6 +1896,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1883,6 +1933,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1913,6 +1964,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1951,6 +2003,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1981,6 +2034,7 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2017,6 +2071,7 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2046,6 +2101,7 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2140,6 +2196,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2284,6 +2341,7 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2380,6 +2438,7 @@ define void 
@flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2407,6 +2466,7 @@ define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2438,6 +2498,7 @@ define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2464,6 +2525,7 @@ define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2510,6 +2572,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2572,6 +2635,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, 
v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2639,6 +2703,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2697,6 +2762,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2758,6 +2824,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2818,6 +2885,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2883,6 +2951,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2939,6 +3008,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3000,6 +3070,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3056,6 +3127,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3117,6 +3189,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3177,6 +3250,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-NEXT: buffer_inv sc0 sc1 @@ -3244,6 +3318,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3302,6 +3377,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3363,6 +3439,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3419,6 +3496,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3486,6 +3564,7 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3544,6 +3623,7 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { 
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3603,6 +3683,7 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3659,6 +3740,7 @@ define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -3722,6 +3804,7 @@ define void @flat_atomic_xor_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3763,6 +3846,7 @@ define void @flat_atomic_xor_i32_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3801,6 +3885,7 @@ define void @flat_atomic_xor_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3839,6 +3924,7 @@ define void @flat_atomic_xor_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3876,6 +3962,7 @@ define void @flat_atomic_xor_i32_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3914,6 +4001,7 @@ define void @flat_atomic_xor_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3955,6 +4043,7 @@ define void @flat_atomic_xor_i32_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3992,6 +4081,7 @@ define void @flat_atomic_xor_i32_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4259,6 +4349,7 @@ define void @flat_atomic_xor_i32_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v[0:1], a0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4288,6 +4379,7 @@ define void 
@flat_atomic_xor_i32_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor v[0:1], v2 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4366,6 +4458,7 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4461,6 +4554,7 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB54_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4552,6 +4646,7 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB55_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4643,6 +4738,7 @@ define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB56_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4730,6 +4826,7 @@ define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB57_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 
v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4821,6 +4918,7 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -4916,6 +5014,7 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB59_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -5003,6 +5102,7 @@ define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB60_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -5098,6 +5198,7 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX950-NEXT: s_setpc_b64 s[30:31] ; GFX950-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -5183,6 +5284,7 @@ define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { ; GFX950-NEXT: s_setpc_b64 s[30:31] ; GFX950-NEXT: .LBB62_3: ; %atomicrmw.global ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll 
b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll index 6f1cb79e6642..e03728da8b00 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll @@ -16,6 +16,7 @@ define void @global_atomic_cmpxchg_i32_ret_av_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -44,6 +45,7 @@ define void @global_atomic_cmpxchg_i32_ret_av_av__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -72,6 +74,7 @@ define void @global_atomic_cmpxchg_i32_ret_av_av__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -103,6 +106,7 @@ define void @global_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -134,6 +138,7 @@ define void @global_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -165,6 +170,7 @@ 
define void @global_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -194,6 +200,7 @@ define void @global_atomic_cmpxchg_i32_ret_a_v__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -222,6 +229,7 @@ define void @global_atomic_cmpxchg_i32_ret_v_v__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -251,6 +259,7 @@ define void @global_atomic_cmpxchg_i32_ret_av_v__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -279,6 +288,7 @@ define void @global_atomic_cmpxchg_i32_ret_v_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -309,6 +319,7 @@ define void @global_atomic_cmpxchg_i32_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], 
v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -339,6 +350,7 @@ define void @global_atomic_cmpxchg_i32_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -371,6 +383,7 @@ define void @global_atomic_cmpxchg_i64_ret_av_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -399,6 +412,7 @@ define void @global_atomic_cmpxchg_i64_ret_av_av__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -427,6 +441,7 @@ define void @global_atomic_cmpxchg_i64_ret_av_av__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -461,6 +476,7 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -495,6 +511,7 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; 
CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -525,6 +542,7 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -555,6 +573,7 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -583,6 +602,7 @@ define void @global_atomic_cmpxchg_i64_ret_v_v__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -613,6 +633,7 @@ define void @global_atomic_cmpxchg_i64_ret_av_v__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -641,6 +662,7 @@ define void @global_atomic_cmpxchg_i64_ret_v_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -671,6 +693,7 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 @@ -701,6 +724,7 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_wbl2 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index b6fe0c756a10..76ef16ad3346 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -16,6 +16,7 @@ define void @global_atomic_xchg_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -35,6 +36,7 @@ define void @global_atomic_xchg_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -60,6 +62,7 @@ define void @global_atomic_xchg_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 
glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -78,6 +81,7 @@ define void @global_atomic_xchg_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -101,6 +105,7 @@ define void @global_atomic_xchg_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -118,6 +123,7 @@ define void @global_atomic_xchg_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -142,6 +148,7 @@ define void @global_atomic_xchg_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -158,6 +165,7 @@ define void @global_atomic_xchg_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -181,6 +189,7 @@ define void @global_atomic_xchg_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -197,6 +206,7 @@ define void @global_atomic_xchg_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -220,6 +230,7 @@ define void @global_atomic_xchg_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -237,6 +248,7 @@ define void @global_atomic_xchg_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -262,6 +274,7 @@ define void @global_atomic_xchg_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -280,6 +293,7 @@ define void @global_atomic_xchg_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -303,6 +317,7 @@ define void @global_atomic_xchg_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { ; 
GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -319,6 +334,7 @@ define void @global_atomic_xchg_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -577,6 +593,7 @@ define void @global_atomic_xchg_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v[0:1], a0, off offset:40 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -590,6 +607,7 @@ define void @global_atomic_xchg_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v[0:1], a0, off offset:40 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -608,6 +626,7 @@ define void @global_atomic_xchg_i32_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap v[0:1], v2, off offset:40 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -621,6 +640,7 @@ define void @global_atomic_xchg_i32_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap v[0:1], v2, off offset:40 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -646,6 +666,7 @@ define void @global_atomic_xchg_i64_ret_a_a(ptr 
addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -667,6 +688,7 @@ define void @global_atomic_xchg_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -694,6 +716,7 @@ define void @global_atomic_xchg_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -713,6 +736,7 @@ define void @global_atomic_xchg_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -736,6 +760,7 @@ define void @global_atomic_xchg_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -754,6 +779,7 @@ define void @global_atomic_xchg_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 
sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -779,6 +805,7 @@ define void @global_atomic_xchg_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -795,6 +822,7 @@ define void @global_atomic_xchg_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -818,6 +846,7 @@ define void @global_atomic_xchg_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -834,6 +863,7 @@ define void @global_atomic_xchg_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -857,6 +887,7 @@ define void @global_atomic_xchg_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -875,6 +906,7 @@ define 
void @global_atomic_xchg_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -902,6 +934,7 @@ define void @global_atomic_xchg_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -921,6 +954,7 @@ define void @global_atomic_xchg_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -944,6 +978,7 @@ define void @global_atomic_xchg_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -960,6 +995,7 @@ define void @global_atomic_xchg_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -982,6 +1018,7 @@ define void @global_atomic_xchg_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: 
buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -995,6 +1032,7 @@ define void @global_atomic_xchg_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1013,6 +1051,7 @@ define void @global_atomic_xchg_i64_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1026,6 +1065,7 @@ define void @global_atomic_xchg_i64_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1056,6 +1096,7 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1087,6 +1128,7 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1125,6 +1167,7 @@ define void 
@global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1155,6 +1198,7 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1191,6 +1235,7 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1221,6 +1266,7 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1258,6 +1304,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1287,6 +1334,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 
v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1323,6 +1371,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1352,6 +1401,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1388,6 +1438,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1418,6 +1469,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1456,6 +1508,7 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, 
v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1486,6 +1539,7 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1522,6 +1576,7 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1551,6 +1606,7 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1645,6 +1701,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1789,6 +1846,7 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1885,6 +1943,7 @@ define 
void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1912,6 +1971,7 @@ define void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -1943,6 +2003,7 @@ define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1969,6 +2030,7 @@ define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2008,6 +2070,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2042,6 +2105,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; 
GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2083,6 +2147,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2115,6 +2180,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2152,6 +2218,7 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2184,6 +2251,7 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2223,6 +2291,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2253,6 +2322,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2290,6 +2360,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2320,6 +2391,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2357,6 +2429,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2389,6 +2462,7 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; 
GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2430,6 +2504,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2462,6 +2537,7 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2499,6 +2575,7 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2529,6 +2606,7 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2567,6 +2645,7 @@ define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], 
off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2596,6 +2675,7 @@ define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2628,6 +2708,7 @@ define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2655,6 +2736,7 @@ define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 @@ -2703,6 +2785,7 @@ define void @global_atomic_xor_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2744,6 +2827,7 @@ define void @global_atomic_xor_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2782,6 +2866,7 @@ define void @global_atomic_xor_i32_ret_v_a(ptr 
addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2820,6 +2905,7 @@ define void @global_atomic_xor_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2857,6 +2943,7 @@ define void @global_atomic_xor_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2895,6 +2982,7 @@ define void @global_atomic_xor_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2936,6 +3024,7 @@ define void @global_atomic_xor_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -2973,6 +3062,7 @@ define void @global_atomic_xor_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3240,6 +3330,7 @@ define void @global_atomic_xor_i32_noret_a(ptr 
addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v[0:1], a0, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3269,6 +3360,7 @@ define void @global_atomic_xor_i32_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor v[0:1], v2, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3313,6 +3405,7 @@ define void @global_atomic_xor_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3357,6 +3450,7 @@ define void @global_atomic_xor_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3396,6 +3490,7 @@ define void @global_atomic_xor_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3435,6 +3530,7 @@ define void @global_atomic_xor_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: 
buffer_inv sc1 @@ -3472,6 +3568,7 @@ define void @global_atomic_xor_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3511,6 +3608,7 @@ define void @global_atomic_xor_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3555,6 +3653,7 @@ define void @global_atomic_xor_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3592,6 +3691,7 @@ define void @global_atomic_xor_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3625,6 +3725,7 @@ define void @global_atomic_xor_i64_noret_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], a[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 @@ -3654,6 +3755,7 @@ define void @global_atomic_xor_i64_noret_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: 
global_atomic_xor_x2 v[0:1], v[2:3], off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 18e2ae291940..ae42404fd381 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -46,6 +46,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB0_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -84,6 +85,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 27308e82a335..029512631b36 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -35,6 +35,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -219,6 +220,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -402,6 +404,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 ; GFX942-NEXT: v_readfirstlane_b32 s5, v1 @@ -790,6 +793,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -990,6 +994,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1183,6 +1188,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1400,6 +1406,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1617,6 +1624,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1857,6 +1865,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2098,6 +2107,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2369,6 +2379,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 ; GFX942-NEXT: v_readfirstlane_b32 s5, v1 @@ -2847,6 +2858,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3107,6 +3119,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3431,6 +3444,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3862,6 +3876,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4371,6 +4386,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -5103,6 +5119,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -5620,6 +5637,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 @@ -6220,7 +6238,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 @@ -6896,6 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7130,6 +7149,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7360,6 +7380,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 ; GFX942-NEXT: v_readfirstlane_b32 s5, v1 @@ -7817,6 +7838,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8067,6 +8089,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8307,6 +8330,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8557,6 +8581,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8825,6 +8850,7 
@@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 @@ -9261,6 +9287,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -9707,6 +9734,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 ; GFX942-NEXT: v_add_f32_e32 v7, v7, v5 @@ -10416,6 +10444,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 @@ -10852,6 +10881,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -11271,6 +11301,7 @@ 
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 @@ -11707,6 +11738,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -12124,6 +12156,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -12524,6 +12557,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -12583,6 +12617,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 5b5fb8f3a166..1b957444869e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -47,6 +47,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -216,6 +217,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -418,6 +420,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -768,6 +771,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1018,6 +1022,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1203,6 +1208,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1384,6 +1390,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1604,6 +1611,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 ; GFX942-NEXT: v_readfirstlane_b32 s5, v1 @@ -1984,6 +1992,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2249,6 +2258,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2518,6 +2528,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2970,6 +2981,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3501,6 +3513,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -4247,6 +4260,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4766,6 
+4780,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 @@ -5368,7 +5383,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 @@ -6077,6 +6092,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] @@ -6377,6 +6393,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] @@ -6720,6 +6737,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_max_f16 v8, v6, v5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: 
Depth=2 @@ -7345,6 +7363,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 @@ -7865,6 +7884,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -8455,6 +8475,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 ; GFX942-NEXT: v_max_f32_e32 v7, v7, v5 @@ -9157,6 +9178,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9207,6 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index c1c512b9c0a1..da140ac4bf59 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -47,6 +47,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -216,6 +217,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -418,6 +420,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -768,6 +771,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1018,6 +1022,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1203,6 +1208,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1384,6 +1390,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1604,6 +1611,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX942-NEXT: v_mov_b32_e32 v6, v5 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 ; GFX942-NEXT: v_readfirstlane_b32 s5, v1 @@ -1984,6 +1992,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2249,6 +2258,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s16 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2518,6 +2528,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2970,6 +2981,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3501,6 +3513,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -4247,6 +4260,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4766,6 
+4780,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 @@ -5368,7 +5383,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 @@ -6077,6 +6092,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] @@ -6377,6 +6393,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] @@ -6720,6 +6737,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_pk_min_f16 v8, v6, v5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: 
Depth=2 @@ -7345,6 +7363,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 @@ -7865,6 +7884,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 @@ -8455,6 +8475,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 ; GFX942-NEXT: v_min_f32_e32 v7, v7, v5 @@ -9157,6 +9178,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9207,6 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index bd272f5d210b..91b0f2e55aad 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -32,6 +32,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -208,6 +209,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -397,6 +399,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -590,6 +593,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -796,6 +800,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1016,6 +1021,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1237,6 +1243,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1304,6 +1311,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_cbranch_execz .LBB6_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1426,6 +1434,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1492,6 +1501,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_cbranch_execz .LBB7_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1645,6 +1655,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1809,6 +1820,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1960,6 +1972,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2177,6 +2190,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2345,6 +2359,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2521,6 +2536,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2710,6 +2726,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2903,6 +2920,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3109,6 +3127,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3329,6 +3348,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3550,6 +3570,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -3617,6 +3638,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_cbranch_execz .LBB18_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3739,6 +3761,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -3805,6 +3828,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_cbranch_execz .LBB19_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3959,6 +3983,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -4026,6 +4051,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_cbranch_execz .LBB20_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -4148,6 +4174,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -4214,6 +4241,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_cbranch_execz .LBB21_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -4367,6 +4395,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4531,6 +4560,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4688,6 +4718,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4852,6 
+4883,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5009,6 +5041,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5185,6 +5218,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5391,6 +5425,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5539,6 +5574,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5752,6 +5788,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB30_5 ; GFX942-NEXT: ; %bb.4: ; 
%atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6193,6 +6230,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB31_5 ; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6654,6 +6692,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB32_5 ; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7105,6 +7144,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB33_5 ; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7539,6 +7579,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB34_5 ; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7988,6 +8029,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB35_5 ; GFX942-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8462,6 +8504,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8834,6 +8877,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9215,6 +9259,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9588,6 +9633,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9948,6 +9994,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10317,6 +10364,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10651,6 +10699,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10925,6 +10974,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11241,6 +11291,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -11390,6 +11441,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -11622,6 +11674,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -11766,6 +11819,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -12026,6 +12080,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12482,6 +12537,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12948,6 +13004,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: 
v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13409,6 +13466,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13861,6 +13919,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14282,6 +14341,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14649,6 +14709,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15037,6 +15098,7 @@ define void 
@flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15485,6 +15547,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15670,6 +15733,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15950,6 +16014,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -16129,6 +16194,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -16284,6 +16350,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16466,6 +16533,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16656,6 +16724,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16853,6 +16922,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17026,6 +17096,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17209,6 +17280,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17403,6 +17475,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -17469,6 +17542,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -17593,6 +17667,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -17657,6 +17732,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -17775,6 +17851,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17957,6 +18034,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18130,6 +18208,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18312,6 +18391,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18489,6 +18569,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -18815,6 +18896,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19149,6 +19231,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19491,6 +19574,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19805,6 +19889,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20129,6 +20214,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20467,6 +20553,7 @@ define <2 x bfloat> 
@flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -20637,6 +20724,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -20801,6 +20889,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -20967,6 +21056,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -21124,6 +21214,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -21450,6 +21541,7 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -21764,6 +21856,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -22090,6 +22183,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 24e98b67cfc7..ddc889f8075d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -40,6 +40,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -182,6 +183,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -336,6 +338,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -494,6 +497,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -634,6 +638,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -787,6 +792,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -948,6 +954,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -995,6 +1002,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1099,6 +1107,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1147,6 +1156,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1248,6 +1258,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1440,6 +1451,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 
@@ -1586,6 +1598,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1728,6 +1741,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1882,6 +1896,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2040,6 +2055,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2180,6 +2196,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2333,6 +2350,7 @@ define void 
@flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2494,6 +2512,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2541,6 +2560,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2645,6 +2665,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2693,6 +2714,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2851,6 +2873,7 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3247,6 +3270,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3659,6 +3683,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4063,6 +4088,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4453,6 +4479,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4858,6 +4885,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5257,6 +5285,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5680,6 +5709,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6109,6 +6139,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6504,6 +6535,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6910,6 +6942,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 
; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7305,6 +7338,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7688,6 +7722,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8082,6 +8117,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8441,6 +8477,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8738,6 +8775,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9071,6 +9109,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9231,6 +9270,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9477,6 +9517,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9632,6 +9673,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9896,6 +9938,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10353,6 +10396,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10820,6 +10864,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11276,6 +11321,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11719,6 +11765,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12172,6 +12219,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12594,6 +12642,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12962,6 +13011,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13363,6 +13413,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13548,6 +13599,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13829,6 +13881,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -14008,6 +14061,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -14192,6 +14246,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14422,6 +14477,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14663,6 +14719,7 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14906,6 +14963,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15126,6 +15184,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15359,6 +15418,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15601,6 +15661,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15681,6 +15742,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_max_f16 v2, 
v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15838,6 +15900,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15916,6 +15979,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -16159,6 +16223,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16608,6 +16673,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17068,6 +17134,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17528,6 +17595,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17960,6 +18028,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18405,6 +18474,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18866,6 +18936,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19044,6 +19115,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: 
v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -19320,6 +19392,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19493,6 +19566,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c3c46ddeadcf..1b3fd173ab7b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -40,6 +40,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -182,6 +183,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -336,6 +338,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -494,6 +497,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -634,6 +638,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -787,6 +792,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -948,6 +954,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX942-NEXT: buffer_inv sc0 sc1 @@ -995,6 +1002,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1099,6 +1107,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1147,6 +1156,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1248,6 +1258,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1440,6 +1451,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1586,6 +1598,7 @@ 
define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1728,6 +1741,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1882,6 +1896,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2040,6 +2055,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2180,6 +2196,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2333,6 +2350,7 @@ define void 
@flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2494,6 +2512,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2541,6 +2560,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2645,6 +2665,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2693,6 +2714,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2851,6 +2873,7 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3247,6 +3270,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3659,6 +3683,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4063,6 +4088,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4453,6 +4479,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4858,6 +4885,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5257,6 +5285,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5680,6 +5709,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6109,6 +6139,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6504,6 +6535,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6910,6 +6942,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 
; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7305,6 +7338,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7688,6 +7722,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8082,6 +8117,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8441,6 +8477,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8738,6 +8775,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9071,6 +9109,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9231,6 +9270,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9477,6 +9517,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9632,6 +9673,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9896,6 +9938,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10353,6 +10396,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10820,6 +10864,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11276,6 +11321,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11719,6 +11765,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12172,6 +12219,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12594,6 +12642,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12962,6 +13011,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13363,6 +13413,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13548,6 +13599,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13829,6 +13881,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -14008,6 +14061,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -14192,6 +14246,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14422,6 +14477,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14663,6 +14719,7 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14906,6 +14963,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15126,6 +15184,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15359,6 +15418,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15601,6 +15661,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15681,6 +15742,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_min_f16 v2, 
v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15838,6 +15900,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15916,6 +15979,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -16159,6 +16223,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16608,6 +16673,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17068,6 +17134,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17528,6 +17595,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17960,6 +18028,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18405,6 +18474,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18866,6 +18936,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19044,6 +19115,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: 
v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -19320,6 +19392,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19493,6 +19566,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 5b1f19ab89cd..53612827cd2a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -55,6 +55,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -250,6 +251,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -455,6 +457,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -661,6 +664,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -846,6 +850,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1044,6 +1049,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1252,6 +1258,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1326,6 +1333,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1452,6 +1460,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1523,6 +1532,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1652,6 +1662,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1847,6 +1858,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2052,6 +2064,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2258,6 +2271,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2443,6 +2457,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2641,6 +2656,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2849,6 +2865,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
buffer_inv sc0 sc1 @@ -2923,6 +2940,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3049,6 +3067,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -3120,6 +3139,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3281,6 +3301,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3707,6 +3728,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -4153,6 +4175,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4590,6 +4613,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5011,6 +5035,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5447,6 +5472,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5909,6 +5935,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 
@@ -6281,6 +6308,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6662,6 +6690,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7035,6 +7064,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7395,6 +7425,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7764,6 +7795,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8101,6 +8133,7 @@ define half 
@flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8378,6 +8411,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8688,6 +8722,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8837,6 +8872,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9069,6 +9105,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9213,6 +9250,7 @@ define void 
@flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9473,6 +9511,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9929,6 +9968,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10395,6 +10435,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10850,6 +10891,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 
sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11292,6 +11334,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11744,6 +11787,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12165,6 +12209,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12532,6 +12577,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12932,6 +12978,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; 
GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13117,6 +13164,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13397,6 +13445,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13576,6 +13625,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13754,6 +13804,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13967,6 +14018,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14191,6 +14243,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v6, v7, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14416,6 +14469,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14617,6 +14671,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14831,6 +14886,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15055,6 +15111,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15129,6 +15186,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15274,6 +15332,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15345,6 +15404,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15582,6 +15642,7 @@ define <2 
x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16031,6 +16092,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16491,6 +16553,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16951,6 +17014,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17383,6 +17447,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17828,6 +17893,7 @@ 
define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18289,6 +18355,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -18467,6 +18534,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -18743,6 +18811,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -18916,6 +18985,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index dffdeebf1220..54fb38ba877a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -25,6 +25,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -37,6 +38,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -66,6 +68,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:2047 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -78,6 +81,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:2047 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -111,6 +115,7 @@ define amdgpu_ps void 
@flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -126,6 +131,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -155,6 +161,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -167,6 +174,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -196,6 +204,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:2048 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-SDAG-NEXT: buffer_inv sc1 @@ -208,6 +217,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:2048 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -241,6 +251,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -256,6 +267,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -317,6 +329,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -331,6 +344,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap 
v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -386,6 +400,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:42 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -400,6 +415,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:42 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -456,6 +472,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -470,6 +487,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -524,6 +542,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: 
buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:42 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -538,6 +557,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:42 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -671,6 +691,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_branch .LBB10_5 ; GFX950-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -712,6 +733,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_branch .LBB10_5 ; GFX950-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -859,6 +881,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: s_branch .LBB11_5 ; GFX950-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -903,6 +926,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: s_branch 
.LBB11_5 ; GFX950-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1023,6 +1047,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1056,6 +1081,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1177,6 +1203,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1214,6 +1241,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1256,6 +1284,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, 
i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1268,6 +1297,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1300,6 +1330,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1315,6 +1346,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1345,6 +1377,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1357,6 +1390,7 @@ define amdgpu_ps 
void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1388,6 +1422,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1403,6 +1438,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1527,6 +1563,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_branch .LBB18_5 ; GFX950-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1569,6 +1606,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_branch .LBB18_5 ; GFX950-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: 
buffer_inv sc1 @@ -1719,6 +1757,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_branch .LBB19_5 ; GFX950-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1764,6 +1803,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: s_branch .LBB19_5 ; GFX950-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -1893,6 +1933,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -1929,6 +1970,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2061,6 +2103,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2101,6 +2144,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2148,6 +2192,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2160,6 +2205,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2192,6 +2238,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2207,6 +2254,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: 
flat_atomic_sub v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2237,6 +2285,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2249,6 +2298,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2280,6 +2330,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2295,6 +2346,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2419,6 +2471,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_branch .LBB26_5 ; GFX950-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt 
vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2463,6 +2516,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_branch .LBB26_5 ; GFX950-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2613,6 +2667,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_branch .LBB27_5 ; GFX950-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2660,6 +2715,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: s_branch .LBB27_5 ; GFX950-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2789,6 +2845,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -2827,6 +2884,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB28_3: ; 
%atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -2959,6 +3017,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3001,6 +3060,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3048,6 +3108,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3060,6 +3121,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3092,6 +3154,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_nop 
1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3107,6 +3170,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3137,6 +3201,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3149,6 +3214,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3180,6 +3246,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3195,6 +3262,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, 
i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3321,6 +3389,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_branch .LBB34_5 ; GFX950-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3364,6 +3433,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_branch .LBB34_5 ; GFX950-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3515,6 +3585,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_branch .LBB35_5 ; GFX950-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3561,6 +3632,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: s_branch .LBB35_5 ; GFX950-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ 
-3691,6 +3763,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3728,6 +3801,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3861,6 +3935,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3902,6 +3977,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3948,6 +4024,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-SDAG-NEXT: buffer_inv sc1 @@ -3960,6 +4037,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -3992,6 +4070,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4007,6 +4086,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4037,6 +4117,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4049,6 +4130,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 ; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4080,6 +4162,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4095,6 +4178,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4221,6 +4305,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG-NEXT: s_branch .LBB42_5 ; GFX950-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4264,6 +4349,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX950-GISEL-NEXT: s_branch .LBB42_5 ; GFX950-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4415,6 +4501,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: s_branch .LBB43_5 ; GFX950-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; 
GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4461,6 +4548,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-GISEL-NEXT: s_branch .LBB43_5 ; GFX950-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4591,6 +4679,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4628,6 +4717,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4761,6 +4851,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4802,6 +4893,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4848,6 +4940,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4860,6 +4953,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4892,6 +4986,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4907,6 +5002,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4937,6 +5033,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4949,6 +5046,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -4980,6 +5078,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -4995,6 +5094,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -5121,6 +5221,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_branch .LBB50_5 ; GFX950-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -5164,6 +5265,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: 
s_branch .LBB50_5 ; GFX950-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -5315,6 +5417,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_branch .LBB51_5 ; GFX950-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -5361,6 +5464,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: s_branch .LBB51_5 ; GFX950-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -5491,6 +5595,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -5528,6 +5633,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -5661,6 +5767,7 @@ define amdgpu_ps void 
@flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 @@ -5702,6 +5809,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 @@ -9093,6 +9201,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9106,6 +9215,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9140,6 +9250,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9156,6 +9267,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9188,6 +9300,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9201,6 +9314,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9233,6 +9347,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9249,6 +9364,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9379,6 +9495,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX950-SDAG-NEXT: s_branch .LBB90_5 ; GFX950-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9426,6 +9543,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX950-GISEL-NEXT: s_branch .LBB90_5 ; GFX950-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9584,6 +9702,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX950-SDAG-NEXT: s_branch .LBB91_5 ; GFX950-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9634,6 +9753,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX950-GISEL-NEXT: s_branch .LBB91_5 ; GFX950-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9771,6 +9891,7 @@ define amdgpu_ps void 
@flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9812,6 +9933,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 @@ -9951,6 +10073,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 @@ -9996,6 +10119,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 72f466063024..648ac6c2517e 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) 
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -57,6 +58,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -97,6 +99,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 4775978f3ff0..e5b55140b44b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1586,7 +1586,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1630,7 +1630,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1662,6 +1662,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1673,6 +1674,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1712,6 +1714,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1742,6 +1745,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1753,6 +1757,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1806,7 +1811,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1840,6 +1845,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1853,6 +1859,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1897,6 +1904,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1930,6 +1938,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1943,6 +1952,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1974,6 +1984,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1985,6 +1996,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2024,6 +2036,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2054,6 +2067,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2066,6 +2080,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2121,6 +2136,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 15e9eda90d04..864df0eda1e4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -48,6 +48,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -245,6 +246,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -444,6 +446,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -653,6 +656,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], 
v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -844,6 +848,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1038,6 +1043,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1241,6 +1247,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1290,6 +1297,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1443,6 +1451,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1491,6 +1500,7 @@ define 
void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1639,6 +1649,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1854,6 +1865,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2069,6 +2081,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2277,6 +2290,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2476,6 +2490,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2661,6 +2676,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2876,6 +2892,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3084,6 +3101,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3278,6 +3296,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3446,6 +3465,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3654,6 +3674,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3867,6 +3888,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4072,6 +4094,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4285,6 +4308,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4490,6 +4514,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4687,6 +4712,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4882,6 +4908,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5065,6 +5092,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5250,6 +5278,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5445,6 +5474,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5610,6 +5640,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5778,6 +5809,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5955,6 +5987,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -5998,6 +6031,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -6143,6 +6177,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -6185,6 +6220,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], 
v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -6313,6 +6349,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6498,6 +6535,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6666,6 +6704,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6879,6 +6918,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7084,6 +7124,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7267,6 +7308,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7453,6 +7495,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7687,6 +7730,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7922,6 +7966,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8159,6 +8204,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8375,6 +8421,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8594,6 +8641,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8975,6 +9023,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9483,6 +9532,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10003,6 +10053,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10510,6 +10561,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11002,6 +11054,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11506,6 +11559,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11957,6 +12011,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12335,6 +12390,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12768,6 +12824,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -12917,6 +12974,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13283,6 +13341,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13427,6 +13486,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13827,6 +13887,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14424,6 +14485,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15035,6 +15097,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15631,6 +15694,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16213,6 +16277,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16809,6 +16874,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -17351,6 +17417,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17828,6 +17895,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18358,6 +18426,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -18543,6 +18612,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -18966,6 +19036,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; 
GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19145,6 +19216,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -19364,6 +19436,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19596,6 +19669,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19830,6 +19904,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20072,6 +20147,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20280,6 +20356,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20491,6 +20568,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -20711,6 +20789,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -20770,6 +20849,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -20948,6 +21028,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], 
v2, off offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -21004,6 +21085,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -21161,6 +21243,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -21407,6 +21490,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -21641,6 +21725,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -21873,6 +21958,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -22081,6 +22167,7 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -22327,6 +22414,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -22565,6 +22653,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -22961,6 +23050,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -23359,6 +23449,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -23765,6 +23856,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -24145,6 +24237,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -24528,6 +24621,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -24920,6 +25014,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -25089,6 +25184,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -25321,6 +25417,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -25485,6 +25582,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -25706,6 +25804,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -26102,6 +26201,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -26482,6 +26582,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -26878,6 +26979,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -27258,6 +27360,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -27654,6 +27757,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 6e1d5293c54e..21762ff4222a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -41,6 +41,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -202,6 +203,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -365,6 +367,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -528,6 +531,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -683,6 +687,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -841,6 +846,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1000,6 +1006,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1045,6 +1052,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1166,6 +1174,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1210,6 +1219,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1326,6 +1336,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1556,6 +1567,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1721,6 +1733,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1882,6 +1895,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2045,6 +2059,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2208,6 +2223,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2363,6 +2379,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2521,6 +2538,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -2680,6 +2698,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2725,6 +2744,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2846,6 +2866,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2890,6 +2911,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3021,6 +3043,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3193,6 +3216,7 @@ define double 
@global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3366,6 +3390,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3538,6 +3563,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3700,6 +3726,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3865,6 +3892,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4031,6 +4059,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4278,6 +4307,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4532,6 +4562,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4975,6 +5006,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5431,6 +5463,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5876,6 +5909,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6307,6 +6341,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6750,6 +6785,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7158,6 +7194,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7494,6 +7531,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7863,6 +7901,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8023,6 +8062,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -8319,6 +8359,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8474,6 +8515,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -8787,6 +8829,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9295,6 +9338,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9815,6 +9859,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10324,6 +10369,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10818,6 +10864,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11323,6 +11370,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -11797,6 +11845,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12207,6 +12256,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12647,6 +12697,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -12832,6 +12883,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13166,6 +13218,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; 
GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13345,6 +13398,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13581,6 +13635,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13862,6 +13917,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14145,6 +14201,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14435,6 +14492,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14703,6 +14761,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14974,6 +15033,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15255,6 +15315,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15334,6 +15395,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15540,6 +15602,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: buffer_inv sc0 sc1 @@ -15616,6 +15679,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15905,6 +15969,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16411,6 +16476,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16919,6 +16985,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17431,6 +17498,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
buffer_inv sc1 @@ -17916,6 +17984,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18404,6 +18473,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18906,6 +18976,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19083,6 +19154,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -19414,6 +19486,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19585,6 +19658,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 2de8873d185d..ea493405612d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -41,6 +41,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -202,6 +203,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -365,6 +367,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -528,6 +531,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: 
v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -683,6 +687,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -841,6 +846,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1000,6 +1006,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1045,6 +1052,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1166,6 +1174,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, 
v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1210,6 +1219,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1326,6 +1336,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1556,6 +1567,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1721,6 +1733,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1882,6 +1895,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2045,6 +2059,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2208,6 +2223,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2363,6 +2379,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2521,6 +2538,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2680,6 +2698,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], 
off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2725,6 +2744,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -2846,6 +2866,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -2890,6 +2911,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3021,6 +3043,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3193,6 +3216,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: buffer_inv sc1 @@ -3366,6 +3390,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3538,6 +3563,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3700,6 +3726,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3865,6 +3892,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4031,6 +4059,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4278,6 +4307,7 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4532,6 +4562,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4975,6 +5006,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5431,6 +5463,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5876,6 +5909,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6307,6 +6341,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6750,6 +6785,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7158,6 +7194,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7494,6 +7531,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 ; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7863,6 +7901,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8023,6 +8062,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: 
v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -8319,6 +8359,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8474,6 +8515,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -8787,6 +8829,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9295,6 +9338,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9815,6 +9859,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10324,6 +10369,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10818,6 +10864,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11323,6 +11370,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11797,6 +11845,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: 
buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12207,6 +12256,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12647,6 +12697,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -12832,6 +12883,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13166,6 +13218,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13345,6 +13398,7 @@ 
define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13581,6 +13635,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13862,6 +13917,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14145,6 +14201,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14435,6 +14492,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14703,6 +14761,7 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14974,6 +15033,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15255,6 +15315,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15334,6 +15395,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15540,6 +15602,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15616,6 +15679,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: 
v_pk_max_f16 v2, v3, v3 ; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15905,6 +15969,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16411,6 +16476,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16919,6 +16985,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17431,6 +17498,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17916,6 +17984,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: v_cndmask_b32_e64 
v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18404,6 +18473,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18906,6 +18976,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19083,6 +19154,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -19414,6 +19486,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19585,6 +19658,7 @@ define void 
@global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index e565d70fe93c..748971fa059c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -56,6 +56,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -287,6 +288,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -520,6 +522,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -761,6 +764,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -981,6 +985,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1204,6 +1209,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -1438,6 +1444,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1511,6 +1518,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1672,6 +1680,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 
sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -1741,6 +1750,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -1903,6 +1913,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2134,6 +2145,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2367,6 +2379,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2608,6 +2621,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -2828,6 +2842,7 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3051,6 +3066,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -3285,6 +3301,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -3358,6 +3375,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3519,6 +3537,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -3588,6 +3607,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr 
addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -3750,6 +3770,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4001,6 +4022,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4253,6 +4275,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4507,6 +4530,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4736,6 +4760,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -4968,6 +4993,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5281,6 +5307,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -5701,6 +5728,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6132,6 +6160,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6555,6 +6584,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, 
v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -6963,6 +6993,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7381,6 +7412,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -7767,6 +7799,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8083,6 +8116,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -8429,6 +8463,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -8578,6 +8613,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -8860,6 +8896,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -9004,6 +9041,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -9313,6 +9351,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -9819,6 +9858,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: 
v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10337,6 +10377,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -10844,6 +10885,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11336,6 +11378,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -11839,6 +11882,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12311,6 +12355,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -12719,6 +12764,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -13157,6 +13203,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13342,6 +13389,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -13674,6 +13722,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: 
v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -13853,6 +13902,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -14082,6 +14132,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14346,6 +14397,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14612,6 +14664,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -14884,6 +14937,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) 
%ptr, <2 x ha ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15133,6 +15187,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15385,6 +15440,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -15648,6 +15704,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15721,6 +15778,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -15915,6 +15973,7 @@ define void 
@global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -15984,6 +16043,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -16267,6 +16327,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -16773,6 +16834,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -17281,6 +17343,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ 
-17793,6 +17856,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18278,6 +18342,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -18766,6 +18831,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -19268,6 +19334,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19445,6 +19512,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_invl2 @@ -19776,6 +19844,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] ; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 @@ -19947,6 +20016,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll index d45cde4ca237..864c74d1056d 100644 --- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='require,atomic-expand' < %s | FileCheck --check-prefix=OPT %s @@ -13,7 +13,6 @@ define i32 @global_agent_monotonic_idempotent_or(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("agent-one-as") monotonic, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") monotonic, align 4 ret i32 %val @@ -31,7 +30,6 @@ define i32 @global_agent_acquire_idempotent_or(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = 
load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("agent-one-as") acquire, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acquire, align 4 ret i32 %val @@ -43,6 +41,7 @@ define i32 @global_agent_release_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -50,7 +49,6 @@ define i32 @global_agent_release_idempotent_or(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4 ret i32 %val @@ -62,6 +60,7 @@ define i32 @global_agent_release_idempotent_or_no_remote(ptr addrspace(1) %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -80,6 +79,7 @@ define i32 @global_agent_release_idempotent_or_no_fine_grained(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -98,6 +98,7 @@ define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -106,7 +107,6 @@ define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acq_rel, align 4 ret i32 %val @@ -118,6 +118,7 @@ define i32 @global_agent_acquire_release_idempotent_or__no_fine_grained(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -137,6 +138,7 @@ define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 @@ -145,7 +147,6 @@ define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") seq_cst, align 4 ret i32 %val @@ -162,7 +163,6 @@ define i32 @global_agent_monotonic_idempotent_add(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("workgroup") monotonic, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw add ptr addrspace(1) %in, i32 0 syncscope("workgroup") monotonic, align 4 ret i32 %val @@ -179,7 +179,6 @@ define i32 
@global_agent_monotonic_idempotent_add__no_fine_grained(ptr addrspace ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw add ptr addrspace(1) %in, i32 0 syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %val @@ -196,7 +195,6 @@ define i32 @global_agent_monotonic_idempotent_sub(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("wavefront") monotonic, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw sub ptr addrspace(1) %in, i32 0 syncscope("wavefront") monotonic, align 4 ret i32 %val @@ -213,7 +211,6 @@ define i32 @global_agent_monotonic_idempotent_sub__no_fine_grained(ptr addrspace ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw sub ptr addrspace(1) %in, i32 0 syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %val @@ -230,7 +227,6 @@ define i32 @global_system_monotonic_idempotent_xor(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] monotonic, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw xor ptr addrspace(1) %in, i32 0 monotonic, align 4 ret i32 %val @@ -247,7 +243,6 @@ define i32 @global_system_monotonic_idempotent_xor__no_fine_grained(ptr addrspac ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw xor ptr addrspace(1) %in, i32 0 monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %val @@ -264,7 +259,6 @@ define 
i32 @global_agent_monotonic_idempotent_and(ptr addrspace(1) %in) { ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("singlethread") monotonic, align 4 ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw and ptr addrspace(1) %in, i32 -1 syncscope("singlethread") monotonic, align 4 ret i32 %val @@ -281,7 +275,6 @@ define i32 @global_agent_monotonic_idempotent_and_no_fined_grain(ptr addrspace(1 ; OPT-NEXT: entry: ; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw and ptr addrspace(1) %in, i32 -1 syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %val diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index b91967bca72d..72e95db6da44 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -168,6 +168,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_bfi_b32 v2, v3, -5, -1 ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 @@ -822,6 +823,7 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 689d1472d601..94de6dd31cad 100644 
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -103,9 +103,10 @@ ; GCN-NEXT: ; implicit-def: $vgpr197 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 @@ -150,10 +151,11 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[160:163] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] @@ -199,9 +201,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 @@ -280,9 +283,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) @@ -322,15 +326,16 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b64 v199, v[238:239] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[242:243] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[244:245] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 @@ -649,15 +654,16 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b64 v199, v[188:189] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[190:191] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[192:193] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[194:195] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] ; GCN-NEXT: v_exp_f32_e32 v101, v125 @@ -792,16 +798,17 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b64 v199, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[150:151] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], 
v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[154:155] ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 ; GCN-NEXT: v_exp_f32_e32 v84, v129 @@ -942,18 +949,19 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b64 v199, v[150:151] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v200, v[152:153] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v202, v[156:157] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b..0a8d7acd187f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -39,6 +39,7 @@ ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v48, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 @@ -91,6 +92,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + 
; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b128 v48, v[32:35] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] ; GCN-NEXT: ;;#ASMSTART @@ -138,12 +140,13 @@ ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b32 v76, v70 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v77, v71 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v78, v72 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] @@ -197,7 +200,7 @@ ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v79, v68 ; GCN-NEXT: ; implicit-def: $vgpr84 ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 @@ -310,6 +313,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ds_write_b32 v76, v31 ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 ; GCN-NEXT: v_exp_f32_e32 v31, v31 @@ -317,13 +321,13 @@ ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v77, v64 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v78, v90 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_write_b32 v79, v65 ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 diff --git 
a/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll new file mode 100644 index 000000000000..912b267d6ee7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-wbl2.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s + +; Test that vmcnt(0) is correctly preserved between buffer_wbl2 and atomic +; when there are global memory stores that need to be written back. + +define void @global_store_different_block(ptr addrspace(1) %data_ptr, ptr addrspace(1) %atomic_ptr, i1 %cond) { +; GFX950-LABEL: global_store_different_block: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_mov_b32_e32 v4, 42 +; GFX950-NEXT: global_store_dword v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_cbranch_execz .LBB0_2 +; GFX950-NEXT: ; %bb.1: ; %do_atomic +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_atomic_swap_x2 v[2:3], v[0:1], off +; GFX950-NEXT: .LBB0_2: ; %exit +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] +entry: + ; Global store in entry block + store i32 42, ptr addrspace(1) %data_ptr, align 4 + call void @llvm.amdgcn.s.waitcnt(i32 112) + br i1 %cond, label %do_atomic, label %exit + +do_atomic: + %old = atomicrmw xchg ptr addrspace(1) %atomic_ptr, i64 0 syncscope("agent") release + br label %exit + +exit: + ret void +} + +define void @global_store_then_atomic(ptr addrspace(1) %data_ptr, ptr addrspace(1) %atomic_ptr) { +; GFX950-LABEL: global_store_then_atomic: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 42 +; GFX950-NEXT: global_store_dword v[0:1], v4, off +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_atomic_swap_x2 v[2:3], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] +entry: + store i32 42, ptr addrspace(1) %data_ptr, align 4 + %old = atomicrmw xchg ptr addrspace(1) %atomic_ptr, i64 0 syncscope("agent") release + ret void +}