From 28d4e33b654be3f839cf2cb362aed92beaaa671c Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi Date: Wed, 17 Dec 2025 17:53:00 +0530 Subject: [PATCH] [AMDGPU][SIInsertWaitCnt] Optimize loadcnt insertion at function boundaries (#169647) On GFX12+, GLOBAL_INV increments the loadcnt counter but does not write results to any VGPRs. Previously, we unconditionally inserted s_wait_loadcnt 0 at function returns even when the only pending loadcnt was from GLOBAL_INV instructions. This patch optimizes waitcnt insertion by skipping the loadcnt wait at function boundaries when no VGPRs have pending loads. This is determined by tracking GLOBAL_INV as its own wait event (GLOBAL_INV_ACCESS) on LOAD_CNT and checking whether a VMEM_ACCESS event is pending - if not, the pending loadcnt must be from non-VGPR-writing instructions like GLOBAL_INV. The optimization is limited to GFX12+ targets where GLOBAL_INV exists and uses the extended wait count instructions. This is a follow-up optimization to PR #135340 which added tracking for GLOBAL_INV in the waitcnt pass. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 31 ++- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 16 -- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 16 -- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 7 +- .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 10 - llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 4 - .../CodeGen/AMDGPU/atomicrmw_usub_cond.ll | 14 -- .../test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll | 30 --- .../buffer-fat-pointer-atomicrmw-fadd.ll | 41 ---- .../buffer-fat-pointer-atomicrmw-fmax.ll | 32 --- .../buffer-fat-pointer-atomicrmw-fmin.ll | 32 --- .../buffer-fat-pointer-atomicrmw-usub_cond.ll | 5 - .../buffer-fat-pointer-atomicrmw-usub_sat.ll | 5 - .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 100 -------- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 90 ------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 90 ------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 86 ------- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 47 ---- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 4 - llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 4 - .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 7 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 224 ------------------ .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 90 ------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 90 ------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 86 ------- .../global-saddr-atomics-min-max-system.ll | 16 -- .../CodeGen/AMDGPU/global-saddr-atomics.ll | 47 ---- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 4 - .../insert_waitcnt_for_precise_memory.ll | 3 - .../llvm.amdgcn.cooperative.atomic-agent.ll | 18 +- .../llvm.amdgcn.cooperative.atomic-system.ll | 18 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 42 ---- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 46 ---- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 46 ---- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 46 ---- .../AMDGPU/waitcnt-func-global-inv.mir | 115 +++++++++ 36 files changed, 157 insertions(+), 1405 deletions(-) create mode 100644 
llvm/test/CodeGen/AMDGPU/waitcnt-func-global-inv.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ab7f55900459..e21583ae0876 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -147,6 +147,7 @@ struct HardwareLimits { DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \ + DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \ DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \ DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \ DECL(VMEM_GROUP) /* vmem group */ \ @@ -402,7 +403,7 @@ public: assert(ST); static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { - eventMask({VMEM_ACCESS}), + eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}), eventMask({LDS_ACCESS, GDS_ACCESS}), eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}), @@ -536,7 +537,8 @@ public: switch (Inst.getOpcode()) { // FIXME: GLOBAL_INV needs to be tracked with xcnt too. case AMDGPU::GLOBAL_INV: - return VMEM_ACCESS; // tracked using loadcnt + return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write + // VGPRs case AMDGPU::GLOBAL_WB: case AMDGPU::GLOBAL_WBINV: return VMEM_WRITE_ACCESS; // tracked using storecnt @@ -1377,6 +1379,20 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; + + // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS), + // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause + // out-of-order completion. 
+ if (T == LOAD_CNT) { + unsigned Events = hasPendingEvent(T); + // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed + // events + Events &= ~(1 << GLOBAL_INV_ACCESS); + // Return true only if there are still multiple event types after removing + // GLOBAL_INV + return Events & (Events - 1); + } + return hasMixedPendingEvents(T); } @@ -1946,7 +1962,16 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); + AMDGPU::Waitcnt AllZeroWait = + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); + // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads + // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt. + // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's + // no need to wait for it at function boundaries. + if (ST->hasExtendedWaitCounts() && + !ScoreBrackets.hasPendingEvent(VMEM_ACCESS)) + AllZeroWait.LoadCnt = ~0u; + Wait = Wait.combined(AllZeroWait); } // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. 
// Technically the hardware will do this on its own if we don't, but that diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 8063b29c2998..32f539a267e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -22,7 +22,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f32: @@ -95,7 +94,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f32: @@ -168,7 +166,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f64: @@ -245,7 +242,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f64: @@ -322,7 +318,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -469,7 +464,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -630,7 +624,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -786,7 +779,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -917,7 +909,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1060,7 +1051,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1220,7 +1210,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1374,7 +1363,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1664,7 +1651,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1838,7 +1824,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2005,7 +1990,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 5b0b602bd99b..be0ef85b217d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -22,7 +22,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f32: @@ -95,7 +94,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f32: @@ -168,7 +166,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f64: @@ -245,7 +242,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f64: @@ -322,7 +318,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -469,7 +464,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -630,7 +624,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -786,7 +779,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -917,7 +909,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1060,7 +1051,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1220,7 +1210,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1374,7 +1363,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1664,7 +1651,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1838,7 +1824,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2005,7 +1990,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 58586129fb4e..1d0b423c1e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1803,7 +1803,6 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1841,7 +1840,6 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 
s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1881,7 +1879,6 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2127,7 +2124,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -2165,7 +2161,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -2207,7 +2202,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 
syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index b75eb737534e..5c80c27c3d28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1232,7 +1232,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr ; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1280,7 +1279,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1322,7 +1320,6 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1367,7 +1364,6 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: global_atomic_add_u32 v0, 
v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1418,7 +1414,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1463,7 +1458,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1513,7 +1507,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1556,7 +1549,6 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr, ; 
GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1601,7 +1593,6 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1655,7 +1646,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e7adfa01b663..18e2ae291940 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -111,7 +111,6 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SYS -; GFX1200-NEXT: s_wait_loadcnt 0x0 ; 
GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res @@ -215,7 +214,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE -; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %res @@ -350,7 +348,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE -; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void @@ -442,7 +439,6 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE -; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret float %res diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll index ce8ffab77ac8..e58ccfd33184 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll @@ -600,7 +600,6 @@ define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond: @@ -638,7 +637,6 @@ define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4 ret i32 %ret @@ -684,7 +682,6 @@ define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond_offset: @@ -723,7 +720,6 @@ define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4 @@ -765,7 +761,6 @@ define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: 
global_atomic_usub_cond_nortn: @@ -802,7 +797,6 @@ define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4 ret void @@ -848,7 +842,6 @@ define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %da ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off offset:4096 scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond_offset_nortn: @@ -887,7 +880,6 @@ define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %da ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off offset:4096 scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4 @@ -1105,7 +1097,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_remote_memory(ptr addrspace(1) %p ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond__amdgpu_no_remote_memory: @@ -1143,7 +1134,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_remote_memory(ptr addrspace(1) %p ; GFX12-GISEL-NEXT: 
global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i32 %ret @@ -1185,7 +1175,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_fine_grained_memory(ptr addrspace ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond__amdgpu_no_fine_grained_memory: @@ -1223,7 +1212,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_fine_grained_memory(ptr addrspace ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %ret @@ -1265,7 +1253,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_fine_grained_memory__amdgpu_no_re ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: global_atomic_usub_cond__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1303,7 +1290,6 @@ define i32 @global_atomic_usub_cond__amdgpu_no_fine_grained_memory__amdgpu_no_re ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret i32 %ret diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll index 708556aae76f..cd0995eabd1b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll @@ -62,7 +62,6 @@ define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat: @@ -118,7 +117,6 @@ define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i32 %ret @@ -184,7 +182,6 @@ define i32 @global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset: @@ -249,7 +246,6 @@ define i32 
@global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -309,7 +305,6 @@ define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_nortn: @@ -364,7 +359,6 @@ define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret void @@ -430,7 +424,6 @@ define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %dat ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset_nortn: @@ -495,7 +488,6 @@ define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %dat ; 
GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -899,7 +891,6 @@ define i16 @global_atomic_usub_sat_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_16: @@ -1011,7 +1002,6 @@ define i16 @global_atomic_usub_sat_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i16 %ret @@ -1128,7 +1118,6 @@ define i16 @global_atomic_usub_sat_offset_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset_16: @@ -1241,7 +1230,6 @@ define i16 @global_atomic_usub_sat_offset_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i16, ptr addrspace(1) 
%ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -1353,7 +1341,6 @@ define void @global_atomic_usub_sat_nortn_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_nortn_16: @@ -1460,7 +1447,6 @@ define void @global_atomic_usub_sat_nortn_16(ptr addrspace(1) %ptr, i16 %data) { ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret void @@ -1573,7 +1559,6 @@ define void @global_atomic_usub_sat_offset_nortn_16(ptr addrspace(1) %ptr, i16 % ; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_16: @@ -1682,7 +1667,6 @@ define void @global_atomic_usub_sat_offset_nortn_16(ptr addrspace(1) %ptr, i16 % ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i16, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -2322,7 +2306,6 @@ define i8 @global_atomic_usub_sat_8(ptr addrspace(1) %ptr, i8 %data) { ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_8: @@ -2441,7 +2424,6 @@ define i8 @global_atomic_usub_sat_8(ptr addrspace(1) %ptr, i8 %data) { ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i8 %ret @@ -2574,7 +2556,6 @@ define i8 @global_atomic_usub_sat_offset_8(ptr addrspace(1) %ptr, i8 %data) { ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset_8: @@ -2693,7 +2674,6 @@ define i8 @global_atomic_usub_sat_offset_8(ptr addrspace(1) %ptr, i8 %data) { ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -2821,7 +2801,6 @@ define void @global_atomic_usub_sat_nortn_8(ptr addrspace(1) %ptr, i8 %data) { ; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_nortn_8: @@ -2935,7 +2914,6 @@ define void @global_atomic_usub_sat_nortn_8(ptr addrspace(1) %ptr, i8 %data) 
{ ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret void @@ -3062,7 +3040,6 @@ define void @global_atomic_usub_sat_offset_nortn_8(ptr addrspace(1) %ptr, i8 %da ; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_8: @@ -3176,7 +3153,6 @@ define void @global_atomic_usub_sat_offset_nortn_8(ptr addrspace(1) %ptr, i8 %da ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024 %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 @@ -3791,7 +3767,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_remote_memory(ptr addrspace(1) %pt ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat__amdgpu_no_remote_memory: @@ -3847,7 +3822,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_remote_memory(ptr addrspace(1) %pt ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i32 %ret @@ -3907,7 +3881,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_fine_grained_memory(ptr addrspace( ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat__amdgpu_no_fine_grained_memory: @@ -3963,7 +3936,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_fine_grained_memory(ptr addrspace( ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %ret @@ -4023,7 +3995,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_fine_grained_memory__amdgpu_no_rem ; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_atomic_usub_sat__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4079,7 +4050,6 @@ define i32 @global_atomic_usub_sat__amdgpu_no_fine_grained_memory__amdgpu_no_rem ; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 
s[30:31] %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret i32 %ret diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index efb75e95212b..70e7ec2ea8b6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -28,7 +28,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -213,7 +212,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -397,7 +395,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -786,7 +783,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen 
offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -987,7 +983,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1181,7 +1176,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1399,7 +1393,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1617,7 +1610,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: 
@@ -1858,7 +1850,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2100,7 +2091,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2370,7 +2360,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2851,7 +2840,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3112,7 +3100,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3373,7 +3360,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3422,7 +3408,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3811,7 +3796,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3859,7 +3843,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4272,7 +4255,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4355,7 +4337,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5053,7 +5034,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5112,7 +5092,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5573,7 +5552,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5631,7 +5609,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -6115,7 +6092,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6209,7 +6185,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6921,7 +6896,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7163,7 +7137,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: buffer_atomic_pk_add_f16 
v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7407,7 +7380,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7875,7 +7847,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -8133,7 +8104,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8387,7 +8357,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8645,7 +8614,6 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8903,7 +8871,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9330,7 +9297,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9764,7 +9730,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -10473,7 +10438,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10900,7 +10864,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -11314,7 +11277,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11741,7 +11703,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -12155,7 +12116,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12574,7 +12534,6 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 
th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index f3bf8c664f7a..064b36cc261b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -28,7 +28,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -200,7 +199,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -388,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -752,7 +749,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen 
offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -1003,7 +999,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1201,7 +1196,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1383,7 +1377,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1602,7 +1595,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1985,7 +1977,6 @@ define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2251,7 +2242,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2453,7 +2443,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2504,7 +2493,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2912,7 +2900,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2962,7 +2949,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3394,7 +3380,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3480,7 +3465,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4194,7 +4178,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4253,7 +4236,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4716,7 +4698,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4774,7 +4755,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5260,7 +5240,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5354,7 +5333,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6089,7 +6067,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6399,7 +6376,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6740,7 +6716,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7323,7 +7298,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7377,7 +7351,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7839,7 +7812,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7889,7 +7861,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -8378,7 +8349,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8468,7 +8438,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9182,7 +9151,6 @@ define float 
@buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index d1dc76f32137..c8e7540124fd 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -28,7 +28,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -200,7 +199,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -388,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -752,7 +749,6 @@ 
define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -1003,7 +999,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1201,7 +1196,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1383,7 +1377,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1602,7 +1595,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1985,7 +1977,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2251,7 +2242,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2453,7 +2443,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2504,7 +2493,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2912,7 +2900,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 
exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2962,7 +2949,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3394,7 +3380,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3480,7 +3465,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4194,7 +4178,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4253,7 +4236,6 @@ 
define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4716,7 +4698,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4774,7 +4755,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5260,7 +5240,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5354,7 +5333,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6089,7 +6067,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6399,7 +6376,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6740,7 +6716,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7323,7 +7298,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7377,7 +7351,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: 
; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7839,7 +7812,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7889,7 +7861,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -8378,7 +8349,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8468,7 +8438,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9182,7 +9151,6 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll index 25bad218926f..2414290ccd62 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll @@ -20,7 +20,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fine_grained_memory: @@ -71,7 +70,6 @@ define void @buffer_fat_ptr_agent_atomic_usub_cond_noret_u32__offset__amdgpu_no_ ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_cond_noret_u32__offset__amdgpu_no_fine_grained_memory: @@ -121,7 +119,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_rem ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_remote_memory: @@ -172,7 +169,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -228,7 +224,6 @@ define i32 @buffer_fat_ptr_system_atomic_usub_cond_ret_u32__offset__amdgpu_no_fi ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_usub_cond_ret_u32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll index 836830cfa418..b4400e6f779c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll @@ -20,7 +20,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine ; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory: @@ -77,7 +76,6 @@ define void 
@buffer_fat_ptr_agent_atomic_usub_sat_noret_u32__offset__amdgpu_no_f ; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_noret_u32__offset__amdgpu_no_fine_grained_memory: @@ -133,7 +131,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_remo ; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_remote_memory: @@ -190,7 +187,6 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine ; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -252,7 +248,6 @@ define i32 @buffer_fat_ptr_system_atomic_usub_sat_ret_u32__offset__amdgpu_no_fin ; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 66d8b3f54d77..0bc1fca0409c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -26,7 +26,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -203,7 +202,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -390,7 +388,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -587,7 +584,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -794,7 +790,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_add_f32 
v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1012,7 +1007,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1237,7 +1231,6 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1427,7 +1420,6 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1647,7 +1639,6 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1812,7 +1803,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -1964,7 +1954,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2182,7 +2171,6 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2351,7 +2339,6 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2528,7 +2515,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2715,7 +2701,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2912,7 +2897,6 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -3119,7 +3103,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3337,7 +3320,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -3562,7 +3544,6 @@ define float 
@flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3752,7 +3733,6 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3973,7 +3953,6 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4163,7 +4142,6 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4383,7 +4361,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4548,7 +4525,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4706,7 +4682,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: @@ -4871,7 +4846,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -5029,7 +5003,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -5206,7 +5179,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; 
GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -5413,7 +5385,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5562,7 +5533,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5753,7 +5723,6 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -6159,7 +6128,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ 
-6620,7 +6588,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -7078,7 +7045,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7511,7 +7477,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7960,7 +7925,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -8432,7 +8396,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8477,7 +8440,6 @@ define half 
@flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8805,7 +8767,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8851,7 +8812,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9189,7 +9149,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9235,7 +9194,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9572,7 +9530,6 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9615,7 +9572,6 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9932,7 +9888,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9976,7 +9931,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10303,7 +10257,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10347,7 +10300,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10663,7 +10615,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10695,7 +10646,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10938,7 +10888,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10972,7 +10921,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11235,7 +11183,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11282,7 +11229,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11622,7 +11568,6 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11667,7 +11612,6 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12013,7 +11957,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -12067,7 +12010,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -12466,7 +12408,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12523,7 +12464,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ 
-12933,7 +12873,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12990,7 +12929,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13399,7 +13337,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13454,7 +13391,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13851,7 +13787,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13906,7 +13841,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14294,7 +14228,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14338,7 +14271,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14665,7 +14597,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14707,7 +14638,6 @@ define void 
@flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -15037,7 +14967,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -15089,7 +15018,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -15478,7 +15406,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15536,7 +15463,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15947,7 +15873,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16003,7 +15928,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16364,7 +16288,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -16552,7 +16475,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16743,7 +16665,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: 
flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16947,7 +16868,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -17127,7 +17047,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17314,7 +17233,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17518,7 +17436,6 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17712,7 +17629,6 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17901,7 +17817,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -18089,7 +18004,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -18269,7 +18183,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18457,7 +18370,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18641,7 +18553,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -18964,7 +18875,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19290,7 +19200,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19630,7 +19539,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19943,7 +19851,6 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20263,7 +20170,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20603,7 +20509,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20932,7 +20837,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21254,7 +21158,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -21577,7 +21480,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21890,7 +21792,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22213,7 +22114,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 8aa1a35f3c60..6831485790ab 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -26,7 +26,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -169,7 +168,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -318,7 +316,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -483,7 +480,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,7 +620,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -772,7 +767,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -940,7 +934,6 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1092,7 +1085,6 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1242,7 +1234,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1435,7 +1426,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1582,7 +1572,6 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1725,7 +1714,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1874,7 +1862,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2039,7 +2026,6 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2180,7 +2166,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2328,7 +2313,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2496,7 +2480,6 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2648,7 +2631,6 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2848,7 +2830,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ 
-3206,7 +3187,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3618,7 +3598,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4027,7 +4006,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4416,7 +4394,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4821,7 +4798,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -5260,7 +5236,6 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5684,7 +5659,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6065,7 +6039,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6112,7 +6085,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6460,7 +6432,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6509,7 +6480,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6869,7 +6839,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6918,7 +6887,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7274,7 +7242,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -7320,7 +7287,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -7656,7 +7622,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7704,7 +7669,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8052,7 +8016,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8100,7 +8063,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8437,7 +8399,6 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8473,7 +8434,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8739,7 +8699,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8774,7 +8733,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9050,7 +9008,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9100,7 +9057,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9461,7 +9417,6 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9510,7 +9465,6 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9873,7 +9827,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9927,7 +9880,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10327,7 +10279,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10384,7 +10335,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10795,7 +10745,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10852,7 +10801,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11261,7 +11209,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11313,7 +11260,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11700,7 +11646,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11755,7 +11700,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12153,7 +12097,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12208,7 +12151,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12597,7 +12539,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12641,7 +12582,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12969,7 +12909,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ 
-13011,7 +12950,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -13346,7 +13284,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13404,7 +13341,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13816,7 +13752,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13872,7 +13807,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14253,7 +14187,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14489,7 +14422,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14728,7 +14660,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14981,7 +14912,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -15208,7 +15138,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15442,7 +15371,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15697,7 +15625,6 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15938,7 +15865,6 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16199,7 +16125,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16251,7 +16176,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16646,7 +16570,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16698,7 +16621,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17096,7 +17018,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17148,7 +17069,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17559,7 +17479,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; 
GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17610,7 +17529,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17991,7 +17909,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18042,7 +17959,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18430,7 +18346,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18481,7 +18396,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18895,7 +18809,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18948,7 +18861,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19346,7 +19258,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19398,7 +19309,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-FAKE16-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index be3143da0a95..4c659f9ca917 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -26,7 +26,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -169,7 +168,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -318,7 +316,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -483,7 +480,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; 
GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,7 +620,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -772,7 +767,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -940,7 +934,6 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1092,7 +1085,6 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1242,7 +1234,6 @@ define float 
@flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1435,7 +1426,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1582,7 +1572,6 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1725,7 +1714,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1874,7 +1862,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2039,7 +2026,6 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2180,7 +2166,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2328,7 +2313,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2496,7 +2480,6 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2648,7 +2631,6 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; 
GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2848,7 +2830,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3206,7 +3187,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3618,7 +3598,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4027,7 +4006,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4416,7 +4394,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; 
GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4821,7 +4798,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -5260,7 +5236,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5684,7 +5659,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6065,7 +6039,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6112,7 +6085,6 @@ define half 
@flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6460,7 +6432,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6509,7 +6480,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6869,7 +6839,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6918,7 +6887,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7274,7 +7242,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -7320,7 +7287,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -7656,7 +7622,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7704,7 +7669,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8052,7 +8016,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8100,7 +8063,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8437,7 +8399,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8473,7 +8434,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8739,7 +8699,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ 
-8774,7 +8733,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9050,7 +9008,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9100,7 +9057,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9461,7 +9417,6 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9510,7 +9465,6 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9873,7 +9827,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9927,7 +9880,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10327,7 +10279,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10384,7 +10335,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10795,7 +10745,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; 
GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10852,7 +10801,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11261,7 +11209,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11313,7 +11260,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11700,7 +11646,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11755,7 +11700,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12153,7 +12097,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12208,7 +12151,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12597,7 +12539,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12641,7 +12582,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12969,7 +12909,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -13011,7 +12950,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -13346,7 +13284,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13404,7 +13341,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13816,7 +13752,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13872,7 +13807,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14253,7 +14187,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14489,7 +14422,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14728,7 +14660,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14981,7 +14912,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -15208,7 +15138,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15442,7 +15371,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15697,7 +15625,6 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15938,7 +15865,6 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16199,7 +16125,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16251,7 +16176,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16646,7 +16570,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16698,7 +16621,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17096,7 +17018,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17148,7 +17069,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17559,7 +17479,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17610,7 +17529,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17991,7 +17909,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18042,7 +17959,6 @@ define void 
@flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18430,7 +18346,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18481,7 +18396,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18895,7 +18809,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18948,7 +18861,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19346,7 +19258,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19398,7 +19309,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 37fad11cd503..9166ad304347 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -43,7 +43,6 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32: @@ -239,7 +238,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -439,7 +437,6 @@ define float 
@flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -652,7 +649,6 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -838,7 +834,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1031,7 +1026,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1246,7 +1240,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1447,7 +1440,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1648,7 +1640,6 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: @@ -1844,7 +1835,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2044,7 +2034,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2257,7 +2246,6 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2443,7 +2431,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2636,7 +2623,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: 
s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2851,7 +2837,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3052,7 +3037,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3278,7 +3262,6 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64: @@ -3661,7 +3644,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4107,7 +4089,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4550,7 +4531,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -4970,7 +4950,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -5406,7 +5385,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -5865,7 +5843,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -5910,7 +5887,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fsub_ret_f16: @@ -6238,7 +6214,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -6284,7 +6259,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -6622,7 +6596,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6668,7 +6641,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -7005,7 +6977,6 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fsub_noret_f16: @@ -7048,7 +7019,6 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -7365,7 +7335,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -7409,7 +7378,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -7736,7 +7704,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7780,7 +7747,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -8097,7 +8063,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -8131,7 +8096,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -8380,7 +8344,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8412,7 +8375,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8668,7 +8630,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 
s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8715,7 +8676,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -9055,7 +9015,6 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -9100,7 +9059,6 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -9446,7 +9404,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -9500,7 +9457,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; 
; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -9899,7 +9855,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9956,7 +9911,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10366,7 +10320,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10423,7 +10376,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10831,7 +10783,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -10883,7 +10834,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -11269,7 +11219,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -11324,7 +11273,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -11721,7 +11669,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -11776,7 +11723,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -12164,7 +12110,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12208,7 +12153,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12535,7 +12479,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -12577,7 +12520,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -12911,7 +12853,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -12969,7 +12910,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13380,7 +13320,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -13436,7 +13375,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -13814,7 +13752,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16: @@ -14033,7 +13970,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14255,7 +14191,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -14490,7 +14425,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -14698,7 +14632,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14913,7 +14846,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -15150,7 +15082,6 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -15373,7 +15304,6 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -15618,7 +15548,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -15670,7 +15599,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -16065,7 +15993,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16117,7 +16044,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16515,7 +16441,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -16567,7 +16492,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -16978,7 +16902,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -17029,7 +16952,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -17410,7 +17332,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17461,7 +17382,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17849,7 +17769,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -17900,7 +17819,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -18314,7 +18232,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -18367,7 +18284,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -18765,7 +18681,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -18817,7 +18732,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 6ae3cfb7e106..bfeeae68d4ca 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -143,7 +143,6 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn: @@ -184,7 +183,6 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_2048: @@ -226,7 +224,6 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv 
scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_neg2048: @@ -286,7 +283,6 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: @@ -303,7 +299,6 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: @@ -355,7 +350,6 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: @@ -372,7 +366,6 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: 
flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: @@ -571,7 +564,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1250-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -618,7 +610,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1250-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -753,7 +744,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1250-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -803,7 +793,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1250-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -1240,7 +1229,6 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn: @@ -1281,7 +1269,6 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn_neg128: @@ -1425,7 +1412,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_4 ; GFX1250-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -1472,7 +1458,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_4 ; GFX1250-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -1611,7 +1596,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_4 ; GFX1250-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -1661,7 +1645,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; 
GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_4 ; GFX1250-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -2130,7 +2113,6 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn: @@ -2171,7 +2153,6 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn_neg128: @@ -2315,7 +2296,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_4 ; GFX1250-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -2362,7 +2342,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_4 ; GFX1250-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: 
global_wb scope:SCOPE_DEV @@ -2503,7 +2482,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_4 ; GFX1250-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -2553,7 +2531,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_4 ; GFX1250-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -3028,7 +3005,6 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn: @@ -3069,7 +3045,6 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn_neg128: @@ -3213,7 +3188,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_4 ; GFX1250-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: 
s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -3261,7 +3235,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_4 ; GFX1250-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -3401,7 +3374,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_4 ; GFX1250-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -3452,7 +3424,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_4 ; GFX1250-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -3926,7 +3897,6 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn: @@ -3967,7 +3937,6 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: flat_atomic_or_b32 v0, 
v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn_neg128: @@ -4111,7 +4080,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_4 ; GFX1250-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -4159,7 +4127,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_4 ; GFX1250-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -4299,7 +4266,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_4 ; GFX1250-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -4350,7 +4316,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_4 ; GFX1250-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb 
scope:SCOPE_DEV @@ -4824,7 +4789,6 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn: @@ -4865,7 +4829,6 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn_neg128: @@ -5009,7 +4972,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_4 ; GFX1250-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -5057,7 +5019,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_4 ; GFX1250-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -5197,7 +5158,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_4 ; GFX1250-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 
0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV @@ -5248,7 +5208,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_4 ; GFX1250-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV @@ -9067,7 +9026,6 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn: @@ -9111,7 +9069,6 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: @@ -9262,7 +9219,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_4 ; GFX1250-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB90_5 ; GFX1250-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS @@ -9311,7 +9267,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: 
s_cbranch_execnz .LBB90_4 ; GFX1250-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB90_5 ; GFX1250-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS @@ -9461,7 +9416,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_4 ; GFX1250-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB91_5 ; GFX1250-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS @@ -9513,7 +9467,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_4 ; GFX1250-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB91_5 ; GFX1250-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 8b7c49b5931a..63520f5ffd50 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -1935,7 +1935,6 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffse ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -1951,7 +1950,6 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] 
offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -1968,7 +1966,6 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 % ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -1984,7 +1981,6 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sb ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 66d859fbd66e..c9d4fd901158 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -113,7 +113,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: flat_atomic_fadd_f32_rtn_pat: @@ -126,7 +125,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret float %ret @@ -185,7 +183,6 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: local_atomic_fadd_v2f16_rtn: @@ -253,7 +250,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: local_atomic_fadd_v2bf16_rtn: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index d1db407cdbe0..608c48d20793 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1684,7 +1684,6 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1722,7 +1721,6 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] 
main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1762,7 +1760,6 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1989,7 +1986,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -2027,7 +2023,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -2069,7 +2064,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") 
seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 21e599481999..8febea5445ef 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -27,7 +27,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: @@ -41,7 +40,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: @@ -224,7 +222,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -238,7 +235,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -423,7 +419,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -437,7 +432,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -632,7 +626,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -646,7 +639,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -823,7 +815,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -837,7 +828,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1017,7 +1007,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1031,7 +1020,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1219,7 +1207,6 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1234,7 
+1221,6 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1421,7 +1407,6 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1436,7 +1421,6 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1618,7 +1602,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: @@ -1632,7 +1615,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: @@ -1833,7 +1815,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -1847,7 +1828,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2048,7 +2028,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2062,7 +2041,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2256,7 +2234,6 @@ define float 
@global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: @@ -2270,7 +2247,6 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: @@ -2455,7 +2431,6 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2469,7 +2444,6 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2640,7 +2614,6 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: @@ -2654,7 +2627,6 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: @@ -2855,7 +2827,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2869,7 +2840,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -3063,7 +3033,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -3077,7 +3046,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX12-NEXT: 
global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -3257,7 +3225,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -3271,7 +3238,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -3425,7 +3391,6 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3439,7 +3404,6 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3633,7 +3597,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: @@ -3647,7 +3610,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: @@ -3846,7 +3808,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3860,7 +3821,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4051,7 +4011,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: 
s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4065,7 +4024,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4264,7 +4222,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4278,7 +4235,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4469,7 +4425,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4483,7 +4438,6 @@ define float 
@global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4666,7 +4620,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4680,7 +4633,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4861,7 +4813,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -4875,7 +4826,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5044,7 +4994,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5058,7 +5007,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5229,7 +5177,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5243,7 +5190,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5424,7 +5370,6 @@ 
define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5438,7 +5383,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5589,7 +5533,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5603,7 +5546,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5757,7 +5699,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5771,7 +5712,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5933,7 +5873,6 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5948,7 +5887,6 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -6121,7 +6059,6 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -6136,7 +6073,6 @@ define void 
@global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -6292,7 +6228,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6306,7 +6241,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6477,7 +6411,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6491,7 +6424,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, 
off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6645,7 +6577,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: @@ -6659,7 +6590,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: @@ -6858,7 +6788,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6872,7 +6801,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -7063,7 +6991,6 @@ define 
float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -7077,7 +7004,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -7246,7 +7172,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -7260,7 +7185,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -7415,7 +7339,6 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7446,7 +7369,6 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7649,7 +7571,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7680,7 +7601,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7884,7 +7804,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7915,7 +7834,6 @@ define double 
@global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8123,7 +8041,6 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -8152,7 +8069,6 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -8339,7 +8255,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8368,7 +8283,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8558,7 +8472,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8587,7 +8500,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8820,7 +8732,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8863,7 +8774,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8907,7 +8817,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8952,7 +8861,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -9328,7 +9236,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9371,7 +9278,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9416,7 +9322,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9462,7 +9367,6 @@ 
define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9850,7 +9754,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9894,7 +9797,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9939,7 +9841,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9985,7 +9886,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10372,7 +10272,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -10413,7 +10312,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -10456,7 +10354,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -10499,7 +10396,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -10862,7 +10758,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g 
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10903,7 +10798,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10947,7 +10841,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10991,7 +10884,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11366,7 +11258,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: 
s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11408,7 +11299,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11452,7 +11342,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11496,7 +11385,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11861,7 +11749,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11893,7 +11780,6 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11926,7 +11812,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11960,7 +11845,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12246,7 +12130,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12276,7 +12159,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12308,7 +12190,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12340,7 +12221,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12629,7 +12509,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12672,7 +12551,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: 
global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12718,7 +12596,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12765,7 +12642,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13153,7 +13029,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13194,7 +13069,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13239,7 +13113,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB53_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13284,7 +13157,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13672,7 +13544,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -13718,7 +13589,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -13775,7 +13645,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -13829,7 +13698,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -14269,7 +14137,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14315,7 +14182,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14373,7 +14239,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14430,7 +14295,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14884,7 +14748,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14932,7 +14795,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14990,7 +14852,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15047,7 +14908,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15499,7 +15359,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -15543,7 +15402,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -15598,7 +15456,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -15650,7 +15507,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -16076,7 +15932,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16120,7 +15975,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16176,7 +16030,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16231,7 +16084,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16671,7 +16523,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16717,7 +16568,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16773,7 +16623,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16828,7 +16677,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17259,7 +17107,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17295,7 +17142,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-FAKE16-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17341,7 +17187,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17385,7 +17230,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17744,7 +17588,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -17778,7 +17621,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 
s[30:31] ; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -17822,7 +17664,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -17864,7 +17705,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -18226,7 +18066,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18272,7 +18111,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18331,7 +18169,6 @@ define bfloat 
@global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18389,7 +18226,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18843,7 +18679,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB63_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18887,7 +18722,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB63_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18944,7 +18778,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19000,7 +18833,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19409,7 +19241,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -19423,7 +19254,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -19652,7 +19482,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19666,7 +19495,6 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19897,7 +19725,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19911,7 +19738,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20146,7 +19972,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -20160,7 +19985,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -20367,7 +20191,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20381,7 +20204,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20591,7 +20413,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20605,7 +20426,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20823,7 +20643,6 @@ define <2 x half> 
@global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20838,7 +20657,6 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21071,7 +20889,6 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21086,7 +20903,6 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21298,7 +21114,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -21312,7 +21127,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -21555,7 +21369,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -21569,7 +21382,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -21802,7 +21614,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -21816,7 +21627,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; 
GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22045,7 +21855,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22059,7 +21868,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22266,7 +22074,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: @@ -22280,7 +22087,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: @@ -22523,7 +22329,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -22537,7 +22342,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -22774,7 +22578,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -22788,7 +22591,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -23162,7 +22964,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23176,7 +22977,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23552,7 +23352,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -23566,7 +23365,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -23946,7 +23744,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -23960,7 +23757,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -24322,7 +24118,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -24336,7 +24131,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -24701,7 +24495,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -24715,7 +24508,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; 
GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -25088,7 +24880,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -25103,7 +24894,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -25481,7 +25271,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -25496,7 +25285,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -25863,7 +25651,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -25877,7 +25664,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -26251,7 +26037,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -26265,7 +26050,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -26627,7 +26411,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -26641,7 +26424,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -27015,7 +26797,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -27029,7 +26810,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -27391,7 +27171,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: @@ -27405,7 
+27184,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: @@ -27779,7 +27557,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -27793,7 +27570,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index dbd48d2a7cf8..31ec099d41cf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -27,7 +27,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -189,7 +188,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: 
global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -353,7 +351,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -517,7 +514,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -673,7 +669,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -832,7 +827,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -992,7 +986,6 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1159,7 +1152,6 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1320,7 +1312,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1551,7 +1542,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1717,7 +1707,6 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1879,7 +1868,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2043,7 +2031,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2207,7 +2194,6 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2363,7 +2349,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2522,7 +2507,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2682,7 +2666,6 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2849,7 +2832,6 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3033,7 +3015,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3206,7 +3187,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3380,7 +3360,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3553,7 +3532,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3716,7 +3694,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3882,7 +3859,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4049,7 +4025,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4297,7 +4272,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4488,7 +4462,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4535,7 +4508,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4933,7 +4905,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4982,7 +4953,6 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5394,7 +5364,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5443,7 +5412,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5851,7 +5819,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5897,7 +5864,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -6282,7 +6248,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6330,7 +6295,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6728,7 +6692,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6776,7 +6739,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7163,7 +7125,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7199,7 +7160,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7504,7 +7464,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7539,7 +7498,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7852,7 +7810,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7902,7 +7859,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8315,7 +8271,6 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8364,7 +8319,6 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8777,7 +8731,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8831,7 +8784,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9282,7 +9234,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9339,7 +9290,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9803,7 +9753,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9860,7 +9809,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10322,7 +10270,6 @@ 
define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10374,7 +10321,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10811,7 +10757,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10866,7 +10811,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11315,7 +11259,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11370,7 +11313,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11810,7 +11752,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11854,7 +11795,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12222,7 +12162,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12264,7 +12203,6 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12637,7 +12575,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12695,7 +12632,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13160,7 +13096,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13216,7 +13151,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13648,7 +13582,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13941,7 +13874,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14236,7 +14168,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14534,7 +14465,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -14816,7 +14746,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15101,7 +15030,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15396,7 +15324,6 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15693,7 +15620,6 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16005,7 +15931,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16057,7 +15982,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16505,7 +16429,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16557,7 +16480,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17007,7 +16929,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17059,7 +16980,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17510,7 +17430,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor 
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17561,7 +17480,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17993,7 +17911,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18044,7 +17961,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18479,7 +18395,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18530,7 +18445,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18977,7 +18891,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19030,7 +18943,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19480,7 +19392,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19532,7 +19443,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 7930ad8a1540..64d42356e896 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -27,7 +27,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -189,7 +188,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -353,7 +351,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -517,7 +514,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: 
global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -673,7 +669,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -832,7 +827,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -992,7 +986,6 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1159,7 +1152,6 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1320,7 +1312,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1551,7 +1542,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1717,7 +1707,6 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1879,7 +1868,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2043,7 +2031,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, 
off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2207,7 +2194,6 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2363,7 +2349,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2522,7 +2507,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2682,7 +2666,6 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2849,7 +2832,6 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3033,7 +3015,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3206,7 +3187,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3380,7 +3360,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3553,7 +3532,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3716,7 +3694,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3882,7 +3859,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4049,7 +4025,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4297,7 +4272,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4488,7 +4462,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4535,7 +4508,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4933,7 +4905,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4982,7 +4953,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5394,7 +5364,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5443,7 +5412,6 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5851,7 +5819,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5897,7 +5864,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -6282,7 +6248,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6330,7 +6295,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6728,7 +6692,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6776,7 +6739,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7163,7 +7125,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7199,7 +7160,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7504,7 +7464,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_cbranch_execnz 
.LBB33_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7539,7 +7498,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7852,7 +7810,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7902,7 +7859,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8315,7 +8271,6 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8364,7 +8319,6 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8777,7 +8731,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8831,7 +8784,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9282,7 +9234,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9339,7 +9290,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9803,7 +9753,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9860,7 +9809,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10322,7 +10270,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10374,7 +10321,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10811,7 +10757,6 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10866,7 +10811,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11315,7 +11259,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11370,7 +11313,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11810,7 +11752,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 
0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11854,7 +11795,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12222,7 +12162,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12264,7 +12203,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -12637,7 +12575,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12695,7 +12632,6 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13160,7 +13096,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13216,7 +13151,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13648,7 +13582,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13941,7 +13874,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14236,7 +14168,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14534,7 +14465,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -14816,7 +14746,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15101,7 +15030,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15396,7 +15324,6 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15693,7 +15620,6 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16005,7 +15931,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16057,7 +15982,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16505,7 +16429,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16557,7 +16480,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17007,7 +16929,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17059,7 +16980,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17510,7 +17430,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17561,7 +17480,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -17993,7 +17911,6 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18044,7 +17961,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18479,7 +18395,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18530,7 +18445,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -18977,7 +18891,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19030,7 +18943,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19480,7 +19392,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19532,7 +19443,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b79d0df960a0..940918a5437b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -44,7 +44,6 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32: @@ -276,7 +275,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -510,7 +508,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -752,7 +749,6 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f32: @@ -973,7 +969,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1197,7 +1192,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1432,7 +1426,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1667,7 +1660,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1899,7 +1891,6 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__ftz: @@ -2131,7 +2122,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2365,7 +2355,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2607,7 +2596,6 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fsub_noret_f32__ftz: @@ -2828,7 +2816,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3052,7 +3039,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3287,7 +3273,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3522,7 +3507,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3754,7 +3738,6 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64: @@ -4006,7 +3989,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4259,7 +4241,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4514,7 +4495,6 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4744,7 +4724,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4977,7 +4956,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5237,7 +5215,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5282,7 +5259,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5660,7 +5636,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5706,7 +5681,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -6096,7 +6070,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6142,7 +6115,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6531,7 +6503,6 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6574,7 +6545,6 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6940,7 +6910,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6984,7 +6953,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -7361,7 +7329,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7405,7 +7372,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7772,7 +7738,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7806,7 +7771,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -8094,7 +8058,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8126,7 +8089,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8419,7 +8381,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8466,7 +8427,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8858,7 +8818,6 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8903,7 +8862,6 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -9299,7 +9257,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -9353,7 +9310,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -9802,7 +9758,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9859,7 +9814,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10321,7 +10275,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10378,7 +10331,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10838,7 +10790,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -10890,7 +10841,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -11325,7 +11275,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -11380,7 +11329,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -11827,7 +11775,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -11882,7 +11829,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -12320,7 +12266,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12364,7 +12309,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12730,7 +12674,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -12772,7 +12715,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 ; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -13143,7 +13085,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13201,7 +13142,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13664,7 +13604,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -13720,7 +13659,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -14148,7 +14086,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -14424,7 +14361,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14702,7 +14638,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -14982,7 +14917,6 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -15245,7 +15179,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -15511,7 +15444,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -15788,7 +15720,6 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -16067,7 +15998,6 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -16363,7 +16293,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -16415,7 +16344,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -16863,7 +16791,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16915,7 +16842,6 @@ 
define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17365,7 +17291,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -17417,7 +17342,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -17868,7 +17792,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -17919,7 +17842,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -18351,7 +18273,6 @@ define void 
@global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -18402,7 +18323,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -18837,7 +18757,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -18888,7 +18807,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -19335,7 +19253,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -19388,7 +19305,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -19838,7 +19754,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -19890,7 +19805,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index 6fd1285c0a34..63d63da56080 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -114,7 +114,6 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 
%zext.offset @@ -227,7 +226,6 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -562,7 +560,6 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -693,7 +690,6 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1042,7 +1038,6 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1155,7 +1150,6 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 
%voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1490,7 +1484,6 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1621,7 +1614,6 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1970,7 +1962,6 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2083,7 +2074,6 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2418,7 +2408,6 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2549,7 +2538,6 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2898,7 +2886,6 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3011,7 +2998,6 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3346,7 +3332,6 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3477,7 +3462,6 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index d297955f109a..58f7c4340276 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -147,7 +147,6 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -189,7 +188,6 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -228,7 +226,6 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -295,7 +292,6 @@ define amdgpu_ps float 
@global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -356,7 +352,6 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -523,7 +518,6 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -561,7 +555,6 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -677,7 +670,6 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -715,7 +707,6 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -827,7 +818,6 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -865,7 +855,6 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -981,7 +970,6 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1019,7 +1007,6 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1131,7 +1118,6 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1169,7 +1155,6 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1285,7 +1270,6 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1323,7 +1307,6 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1435,7 +1418,6 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1473,7 +1455,6 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1589,7 +1570,6 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i ; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1627,7 +1607,6 @@ 
define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1739,7 +1718,6 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1777,7 +1755,6 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1893,7 +1870,6 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1931,7 +1907,6 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: 
global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2043,7 +2018,6 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2081,7 +2055,6 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2194,7 +2167,6 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2229,7 +2201,6 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2330,7 +2301,6 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2365,7 +2335,6 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2470,7 +2439,6 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2505,7 +2473,6 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; 
return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2606,7 +2573,6 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2641,7 +2607,6 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2746,7 +2711,6 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2781,7 +2745,6 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr 
addrspace(1) %sbase, i64 %zext.offset @@ -2882,7 +2845,6 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2917,7 +2879,6 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3022,7 +2983,6 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3057,7 +3017,6 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3158,7 +3117,6 @@ define amdgpu_ps <2 x float> 
@global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3193,7 +3151,6 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3307,7 +3264,6 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sba ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3352,7 +3308,6 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inr ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3487,7 +3442,6 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inre ; GFX12-NEXT: global_atomic_cmpswap_b64 
v[0:1], v0, v[3:6], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3536,7 +3490,6 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace( ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 890ebddf3680..eb1d950a9046 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -3771,7 +3771,6 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3809,7 +3808,6 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3848,7 +3846,6 @@ define amdgpu_ps <2 x 
float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3886,7 +3883,6 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index d4867dbaa14b..b91967bca72d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -127,7 +127,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void @@ -284,7 +283,6 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result @@ -872,7 +870,6 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX12-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll index 614a221d43d5..95e9ab26dbca 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll @@ -88,7 +88,6 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef r ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0) @@ -103,7 +102,6 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr nou ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0) @@ -118,7 +116,6 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr nou ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0) @@ -179,7 +176,6 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef r ; GFX1250-NEXT: flat_load_b32 v0, 
v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0) @@ -195,7 +191,6 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr nou ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0) @@ -211,7 +206,6 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr nou ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0) @@ -349,7 +343,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr no ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1) @@ -364,7 +358,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire( ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> 
@llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1) @@ -379,7 +373,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire( ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1) @@ -440,7 +434,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr no ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1) @@ -456,7 +450,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst( ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1) @@ -472,7 +466,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst( ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll index c4234cc0de06..b18590fb14a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll @@ -88,7 +88,6 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef r ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0) @@ -103,7 +102,6 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr nou ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0) @@ -118,7 +116,6 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr nou ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0) @@ -179,7 +176,6 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef r ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0) @@ -195,7 +191,6 @@ define <2 x i32> 
@test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr nou ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0) @@ -211,7 +206,6 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr nou ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0) @@ -349,7 +343,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr no ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1) @@ -364,7 +358,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire( ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1) @@ -379,7 +373,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire( ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1) @@ -440,7 +434,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr no ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1) @@ -456,7 +450,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst( ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1) @@ -472,7 +466,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst( ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 0a7e662975b4..96c4c3e74b38 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -28,7 +28,6 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: 
s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f32: @@ -144,7 +143,6 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f32__offset: @@ -261,7 +259,6 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f32: @@ -375,7 +372,6 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f32__offset: @@ -511,7 +507,6 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f64: @@ -703,7 +698,6 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f64__offset: @@ -894,7 +888,6 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f64: @@ -1077,7 +1070,6 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1279,7 +1271,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16: @@ -1322,7 +1313,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16: @@ -1664,7 +1654,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1709,7 +1698,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset: @@ -2060,7 +2048,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16: @@ -2102,7 +2089,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16: @@ -2432,7 +2418,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2475,7 +2460,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2806,7 +2790,6 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -2840,7 
+2823,6 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -3103,7 +3085,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -3135,7 +3116,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -3413,7 +3393,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16: @@ -3465,7 +3444,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16: @@ -3869,7 +3847,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr 
addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3923,7 +3900,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -4336,7 +4312,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16: @@ -4387,7 +4362,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16: @@ -4778,7 +4752,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4830,7 +4803,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_cbranch_execnz 
.LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -5222,7 +5194,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -5265,7 +5236,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -5596,7 +5566,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -5637,7 +5606,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -5933,7 +5901,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: 
s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_v2f16: @@ -6157,7 +6124,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_v2f16__offset: @@ -6381,7 +6347,6 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_v2f16: @@ -6596,7 +6561,6 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -6817,7 +6781,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_v2bf16: @@ -7171,7 +7134,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_v2bf16__offset: @@ -7526,7 +7488,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; 
GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_v2bf16: @@ -7867,7 +7828,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -9903,7 +9863,6 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: @@ -10019,7 +9978,6 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 657a90d71b8e..e0745fda6c00 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -28,7 +28,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f32: @@ -118,7 +117,6 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_max_num_rtn_f32 
v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f32__offset: @@ -210,7 +208,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f32: @@ -300,7 +297,6 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f32__offset: @@ -397,7 +393,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f64: @@ -495,7 +490,6 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f64__offset: @@ -595,7 +589,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f64: @@ -693,7 +686,6 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f64__offset: @@ -825,7 +817,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16: @@ -869,7 +860,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16: @@ -1221,7 +1211,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1267,7 +1256,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1628,7 +1616,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16: @@ -1671,7 +1658,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16: @@ -2010,7 +1996,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2055,7 +2040,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2396,7 +2380,6 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2431,7 +2414,6 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2703,7 +2685,6 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2737,7 +2718,6 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -3023,7 +3003,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16: @@ -3075,7 +3054,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16: @@ -3481,7 +3459,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
local_atomic_fmax_ret_bf16__offset: @@ -3535,7 +3512,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3950,7 +3926,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16: @@ -4001,7 +3976,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16: @@ -4394,7 +4368,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -4446,7 +4419,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -4840,7 +4812,6 @@ define bfloat 
@local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -4883,7 +4854,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -5216,7 +5186,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -5257,7 +5226,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -5574,7 +5542,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2f16: @@ -5846,7 +5813,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -6117,7 +6083,6 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_v2f16: @@ -6379,7 +6344,6 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -6668,7 +6632,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16: @@ -6720,7 +6683,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16: @@ -7146,7 +7108,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7198,7 +7159,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7621,7 +7581,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16: @@ -7671,7 +7630,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_v2bf16: @@ -8079,7 +8037,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -8129,7 +8086,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
local_atomic_fmax_noret_v2bf16__ofset: @@ -8509,7 +8465,6 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: @@ -8599,7 +8554,6 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 1144291ec95d..a03d02691a8b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -28,7 +28,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f32: @@ -118,7 +117,6 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f32__offset: @@ -210,7 +208,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
local_atomic_fmin_noret_f32: @@ -300,7 +297,6 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f32__offset: @@ -397,7 +393,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f64: @@ -495,7 +490,6 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f64__offset: @@ -595,7 +589,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f64: @@ -693,7 +686,6 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f64__offset: @@ -825,7 +817,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16: @@ -869,7 +860,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16: @@ -1221,7 +1211,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1267,7 +1256,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1628,7 +1616,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16: @@ -1671,7 +1658,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
local_atomic_fmin_noret_f16: @@ -2010,7 +1996,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2055,7 +2040,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2396,7 +2380,6 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2431,7 +2414,6 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2703,7 +2685,6 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ 
-2737,7 +2718,6 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -3023,7 +3003,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16: @@ -3075,7 +3054,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16: @@ -3481,7 +3459,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3535,7 +3512,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3950,7 +3926,6 @@ define void @local_atomic_fmin_noret_bf16(ptr 
addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16: @@ -4001,7 +3976,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16: @@ -4394,7 +4368,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -4446,7 +4419,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -4840,7 +4812,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -4883,7 +4854,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -5216,7 +5186,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -5257,7 +5226,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -5574,7 +5542,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2f16: @@ -5846,7 +5813,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -6117,7 +6083,6 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2f16: @@ -6379,7 +6344,6 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -6668,7 +6632,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16: @@ -6720,7 +6683,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16: @@ -7146,7 +7108,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7198,7 +7159,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7621,7 +7581,6 @@ define void 
@local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16: @@ -7671,7 +7630,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16: @@ -8079,7 +8037,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -8129,7 +8086,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -8509,7 +8465,6 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: @@ -8599,7 +8554,6 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: 
ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index dfe37fda186c..27dc54969380 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -44,7 +44,6 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f32: @@ -256,7 +255,6 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f32__offset: @@ -467,7 +465,6 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f32: @@ -668,7 +665,6 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f32__offset: @@ -877,7 +873,6 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f64: @@ -1094,7 +1089,6 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1310,7 +1304,6 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f64: @@ -1516,7 +1509,6 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1741,7 +1733,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16: @@ -1784,7 +1775,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16: @@ -2126,7 +2116,6 @@ define half 
@local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2171,7 +2160,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2522,7 +2510,6 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16: @@ -2564,7 +2551,6 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16: @@ -2894,7 +2880,6 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2937,7 +2922,6 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16__offset: @@ -3268,7 +3252,6 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3302,7 +3285,6 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3565,7 +3547,6 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3597,7 +3578,6 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3875,7 +3855,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16: @@ -3927,7 +3906,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16: @@ -4331,7 +4309,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4385,7 +4362,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4798,7 +4774,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16: @@ -4849,7 +4824,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16: @@ -5240,7 +5214,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -5292,7 +5265,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -5684,7 +5656,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -5727,7 +5698,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -6058,7 +6028,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -6099,7 +6068,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -6412,7 +6380,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2f16: @@ -6667,7 +6634,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2f16__offset: @@ -6920,7 +6886,6 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_v2f16: @@ -7163,7 +7128,6 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -7436,7 +7400,6 @@ define <2 x bfloat> 
@local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7488,7 +7451,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7914,7 +7876,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -7966,7 +7927,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -8389,7 +8349,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16: @@ -8439,7 +8398,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: 
s_cbranch_execnz .LBB26_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_v2bf16: @@ -8847,7 +8805,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -8897,7 +8854,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -9293,7 +9249,6 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: @@ -9503,7 +9458,6 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-func-global-inv.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-func-global-inv.mir new file mode 100644 index 000000000000..5068843d1702 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/waitcnt-func-global-inv.mir @@ -0,0 +1,115 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s + +# Test that we can optimize away s_wait_loadcnt at function boundaries when +# the only pending LOAD_CNT events are from GLOBAL_INV (which doesn't write +# to VGPRs). +# +# When a function contains only GLOBAL_INV with no actual VMEM loads pending +# to VGPRs, we should not need to emit s_wait_loadcnt 0 before the return. + +--- +# Test 1: Only GLOBAL_INV, no VGPR loads - should NOT need S_WAIT_LOADCNT +# before return because GLOBAL_INV doesn't write to VGPRs. +name: func_global_inv_only +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + liveins: $sgpr30_sgpr31 + + ; GFX12-LABEL: name: func_global_inv_only + ; GFX12: liveins: $sgpr30_sgpr31 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec + ; GFX12-NOT: S_WAIT_LOADCNT + ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + GLOBAL_INV 16, implicit $exec + S_SETPC_B64_return $sgpr30_sgpr31 +... 
+--- +# Test 2: GLOBAL_INV with actual VGPR load - MUST wait for loadcnt +name: func_global_inv_with_vgpr_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + + ; GFX12-LABEL: name: func_global_inv_with_vgpr_load + ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec + ; GFX12-NEXT: S_WAIT_LOADCNT 0 + ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + GLOBAL_INV 16, implicit $exec + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +# Test 3: Only VGPR load (no GLOBAL_INV) - MUST wait for loadcnt +name: func_vgpr_load_no_global_inv +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + + ; GFX12-LABEL: name: func_vgpr_load_no_global_inv + ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: S_WAIT_LOADCNT 0 + ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... +--- +# Test 4: GLOBAL_INV with load already waited on - should NOT need S_WAIT_LOADCNT at return +# The load was waited on when $vgpr0 was used, so only GLOBAL_INV is pending at return. 
+name: func_global_inv_load_already_waited +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + + ; GFX12-LABEL: name: func_global_inv_load_already_waited + ; GFX12: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr30_sgpr31 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: S_WAIT_LOADCNT 0 + ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec + ; GFX12-NOT: S_WAIT_LOADCNT + ; GFX12-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + GLOBAL_INV 16, implicit $exec + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... +